Compare commits
687 Commits
.gitignore (21 changes, vendored)

```diff
@@ -48,13 +48,26 @@ Debug
 *.cpage.col
 *.cpage
 *.Rproj
-xgboost
-xgboost.mpi
-xgboost.mock
-train*
+./xgboost
+./xgboost.mpi
+./xgboost.mock
 rabit
 #.Rbuildignore
 R-package.Rproj
 *.cache*
 R-package/inst
 R-package/src
+#java
+java/xgboost4j/target
+java/xgboost4j/tmp
+java/xgboost4j-demo/target
+java/xgboost4j-demo/data/
+java/xgboost4j-demo/tmp/
+java/xgboost4j-demo/model/
+nb-configuration*
+dmlc-core
+# Eclipse
+.project
+.cproject
+.pydevproject
+.settings/
```
.travis.yml (new file, 58 lines)

```diff
@@ -0,0 +1,58 @@
+sudo: true
+
+# Enabling test on Linux and OS X
+os:
+  - linux
+  - osx
+
+# Use Build Matrix to do lint and build seperately
+env:
+  matrix:
+    - TASK=lint LINT_LANG=cpp
+    - TASK=lint LINT_LANG=python
+    - TASK=R-package CXX=g++
+    - TASK=python-package CXX=g++
+    - TASK=python-package3 CXX=g++
+    - TASK=java-package CXX=g++
+    - TASK=build CXX=g++
+    - TASK=build-with-dmlc CXX=g++
+
+os:
+  - linux
+  - osx
+
+# dependent apt packages
+addons:
+  apt:
+    packages:
+      - doxygen
+      - libopenmpi-dev
+      - wget
+      - libcurl4-openssl-dev
+      - unzip
+      - python-numpy
+      - python-scipy
+
+before_install:
+  - scripts/travis_osx_install.sh
+  - git clone https://github.com/dmlc/dmlc-core
+  - export TRAVIS=dmlc-core/scripts/travis/
+  - export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package
+  - source ${TRAVIS}/travis_setup_env.sh
+
+install:
+  - pip install cpplint pylint --user `whoami`
+
+
+script: scripts/travis_script.sh
+
+
+after_failure:
+  - scripts/travis_after_failure.sh
+
+
+notifications:
+  email:
+    on_success: change
+    on_failure: always
+
```
CHANGES.md (35 changes)

```diff
@@ -1,18 +1,18 @@
 Change Log
-=====
+==========
 
 xgboost-0.1
-=====
+-----------
 * Initial release
 
 xgboost-0.2x
-=====
+------------
 * Python module
 * Weighted samples instances
 * Initial version of pairwise rank
 
 xgboost-0.3
-=====
+-----------
 * Faster tree construction module
   - Allows subsample columns during tree construction via ```bst:col_samplebytree=ratio```
 * Support for boosting from initial predictions
@@ -22,7 +22,7 @@ xgboost-0.3
 * Add R module
 
 xgboost-0.4
-=====
+-----------
 * Distributed version of xgboost that runs on YARN, scales to billions of examples
 * Direct save/load data and model from/to S3 and HDFS
 * Feature importance visualization in R module, by Michael Benesty
@@ -34,3 +34,28 @@ xgboost-0.4
   - xgboost python model is now pickable
 * sklearn wrapper is supported in python module
 * Experimental External memory version
+
+xgboost-0.47
+------------
+* Changes in R library
+  - fixed possible problem of poisson regression.
+  - switched from 0 to NA for missing values.
+  - exposed access to additional model parameters.
+* Changes in Python library
+  - throws exception instead of crash terminal when a parameter error happens.
+  - has importance plot and tree plot functions.
+  - accepts different learning rates for each boosting round.
+  - allows model training continuation from previously saved model.
+  - allows early stopping in CV.
+  - allows feval to return a list of tuples.
+  - allows eval_metric to handle additional format.
+  - improved compatibility in sklearn module.
+  - additional parameters added for sklearn wrapper.
+  - added pip installation functionality.
+  - supports more Pandas DataFrame dtypes.
+  - added best_ntree_limit attribute, in addition to best_score and best_iteration.
+* Java api is ready for use
+* Added more test cases and continuous integration to make each build more robust.
+
+on going at master
+------------------
```
CONTRIBUTORS.md (new file, 61 lines)

```diff
@@ -0,0 +1,61 @@
+Contributors of DMLC/XGBoost
+============================
+XGBoost has been developed and used by an active community. Everyone is more than welcome to contribute, which is a great way to make the project better and more accessible to more users.
+
+Committers
+----------
+Committers are people who have made substantial contributions to the project and have been granted write access to the project.
+* [Tianqi Chen](https://github.com/tqchen), University of Washington
+  - Tianqi is a PhD student working on large-scale machine learning; he is the creator of the project.
+* [Tong He](https://github.com/hetong007), Simon Fraser University
+  - Tong is a master's student working on data mining; he is the maintainer of the xgboost R package.
+* [Bing Xu](https://github.com/antinucleon)
+  - Bing is the original creator of the xgboost Python package and currently the maintainer of [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl).
+* [Michael Benesty](https://github.com/pommedeterresautee)
+  - Michael is a lawyer and data scientist in France; he is the creator of the xgboost interactive analysis module in R.
+* [Yuan Tang](https://github.com/terrytangyuan)
+  - Yuan is a data scientist in Chicago, US. He contributed mostly to the R and Python packages.
+
+Become a Committer
+------------------
+XGBoost is an open-source project, and we are actively looking for new committers who are willing to help maintain and lead the project.
+Committers come from contributors who:
+* Made substantial contributions to the project.
+* Are willing to spend time on maintaining and leading the project.
+
+New committers will be proposed by current committers, with support from more than two current committers.
+
+List of Contributors
+--------------------
+* [Full List of Contributors](https://github.com/dmlc/xgboost/graphs/contributors)
+  - To contributors: please add your name to the list when you submit a patch to the project:)
+* [Kailong Chen](https://github.com/kalenhaha)
+  - Kailong is an early contributor of xgboost; he is the creator of the ranking objectives in xgboost.
+* [Skipper Seabold](https://github.com/jseabold)
+  - Skipper is the major contributor to the scikit-learn module of xgboost.
+* [Zygmunt Zając](https://github.com/zygmuntz)
+  - Zygmunt is the master behind the early stopping feature frequently used by kagglers.
+* [Ajinkya Kale](https://github.com/ajkl)
+* [Boliang Chen](https://github.com/cblsjtu)
+* [Vadim Khotilovich](https://github.com/khotilov)
+* [Yangqing Men](https://github.com/yanqingmen)
+  - Yangqing is the creator of the xgboost Java package.
+* [Engpeng Yao](https://github.com/yepyao)
+* [Giulio](https://github.com/giuliohome)
+  - Giulio is the creator of the Windows project of xgboost.
+* [Jamie Hall](https://github.com/nerdcha)
+  - Jamie is the initial creator of the xgboost sklearn module.
+* [Yen-Ying Lee](https://github.com/white1033)
+* [Masaaki Horikoshi](https://github.com/sinhrks)
+  - Masaaki is the initial creator of the xgboost Python plotting module.
+* [Hongliang Liu](https://github.com/phunterlau)
+  - Hongliang is the maintainer of the xgboost Python PyPI package for pip installation.
+* [daiyl0320](https://github.com/daiyl0320)
+  - daiyl0320 contributed patches making the xgboost distributed version more robust and scale stably on TB-scale datasets.
+* [Huayi Zhang](https://github.com/irachex)
+* [Johan Manders](https://github.com/johanmanders)
+* [yoori](https://github.com/yoori)
+* [Mathias Müller](https://github.com/far0n)
+* [Sam Thomson](https://github.com/sammthomson)
+* [ganesh-krishnan](https://github.com/ganesh-krishnan)
+* [Damien Carol](https://github.com/damiencarol)
```
LICENSE (2 changes)

```diff
@@ -1,4 +1,4 @@
-Copyright (c) 2014 by Tianqi Chen and Contributors
+Copyright (c) 2014 by Contributors
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
```
Makefile (101 changes)

```diff
@@ -1,24 +1,45 @@
-export CC = gcc
-export CXX = g++
+export CC = $(if $(shell which gcc-5 2>/dev/null),gcc-5,gcc)
+export CXX = $(if $(shell which g++-5 2>/dev/null),g++-5,g++)
 
 export MPICXX = mpicxx
 export LDFLAGS= -pthread -lm
-export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops
+# java include path
+export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java
 
 ifeq ($(OS), Windows_NT)
 	export CXX = g++ -m64
 	export CC = gcc -m64
 endif
 
+UNAME= $(shell uname)
+
+ifeq ($(UNAME), Linux)
+	LDFLAGS += -lrt
+	JAVAINCFLAGS += -I${JAVA_HOME}/include/linux
+endif
+
+ifeq ($(UNAME), Darwin)
+	JAVAINCFLAGS += -I${JAVA_HOME}/include/darwin
+endif
+
 ifeq ($(no_omp),1)
 	CFLAGS += -DDISABLE_OPENMP
 else
-	CFLAGS += -fopenmp
+	#CFLAGS += -fopenmp
+	ifeq ($(omp_mac_static),1)
+	#CFLAGS += -fopenmp -Bstatic
+	CFLAGS += -static-libgcc -static-libstdc++ -L. -fopenmp
+	#LDFLAGS += -Wl,--whole-archive -lpthread -Wl --no-whole-archive
+	else
+	CFLAGS += -fopenmp
+	endif
 endif
 
 # by default use c++11
 ifeq ($(cxx11),1)
 	CFLAGS += -std=c++11
-else
 endif
 
 # handling dmlc
@@ -38,6 +59,14 @@ else
 	LIBDMLC=dmlc_simple.o
 endif
 
+ifndef WITH_FPIC
+	WITH_FPIC = 1
+endif
+ifeq ($(WITH_FPIC), 1)
+	CFLAGS += -fPIC
+endif
+
+
 ifeq ($(OS), Windows_NT)
 	LIBRABIT = subtree/rabit/lib/librabit_empty.a
 	SLIB = wrapper/xgboost_wrapper.dll
@@ -46,16 +75,27 @@ else
 	SLIB = wrapper/libxgboostwrapper.so
 endif
 
+# java lib
+JLIB = java/libxgboost4j.so
+
 # specify tensor path
 BIN = xgboost
 MOCKBIN = xgboost.mock
 OBJ = updater.o gbm.o io.o main.o dmlc_simple.o
 MPIBIN =
-TARGET = $(BIN) $(OBJ) $(SLIB)
+ifeq ($(WITH_FPIC), 1)
+	TARGET = $(BIN) $(OBJ) $(SLIB)
+else
+	TARGET = $(BIN)
+endif
 
-.PHONY: clean all mpi python Rpack
+ifndef LINT_LANG
+	LINT_LANG= "all"
+endif
+
+.PHONY: clean all mpi python Rpack lint
 
-all: $(BIN) $(OBJ) $(SLIB)
+all: $(TARGET)
 mpi: $(MPIBIN)
 
 python: wrapper/libxgboostwrapper.so
@@ -68,6 +108,9 @@ main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner
 xgboost: updater.o gbm.o io.o main.o $(LIBRABIT) $(LIBDMLC)
 wrapper/xgboost_wrapper.dll wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o $(LIBRABIT) $(LIBDMLC)
 
+java: java/libxgboost4j.so
+java/libxgboost4j.so: java/xgboost4j_wrapper.cpp wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o $(LIBRABIT) $(LIBDMLC)
+
 # dependency on rabit
 subtree/rabit/lib/librabit.a: subtree/rabit/src/engine.cc
 	+	cd subtree/rabit;make lib/librabit.a; cd ../..
@@ -79,7 +122,7 @@ subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
 	+	cd subtree/rabit;make lib/librabit_mpi.a; cd ../..
 
 $(BIN) :
-	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
+	$(CXX) $(CFLAGS) -fPIC -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
 
 $(MOCKBIN) :
 	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
@@ -87,6 +130,9 @@ $(MOCKBIN) :
 $(SLIB) :
 	$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) $(DLLFLAGS)
 
+$(JLIB) :
+	$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) $(JAVAINCFLAGS)
+
 $(OBJ) :
 	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
 
@@ -122,10 +168,47 @@ Rpack:
 	cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
 	cp xgboost/src/Makevars xgboost/src/Makevars.win
 	# R CMD build --no-build-vignettes xgboost
+	# R CMD build xgboost
+	# rm -rf xgboost
+	# R CMD check --as-cran xgboost*.tar.gz
+
+Rbuild:
+	make Rpack
 	R CMD build xgboost
 	rm -rf xgboost
+
+Rcheck:
+	make Rbuild
 	R CMD check --as-cran xgboost*.tar.gz
+
+pythonpack:
+	#for pip maintainer only
+	cd subtree/rabit;make clean;cd ..
+	rm -rf xgboost-deploy xgboost*.tar.gz
+	cp -r python-package xgboost-deploy
+	#cp *.md xgboost-deploy/
+	cp LICENSE xgboost-deploy/
+	cp Makefile xgboost-deploy/xgboost
+	cp -r wrapper xgboost-deploy/xgboost
+	cp -r subtree xgboost-deploy/xgboost
+	cp -r multi-node xgboost-deploy/xgboost
+	cp -r windows xgboost-deploy/xgboost
+	cp -r src xgboost-deploy/xgboost
+	cp python-package/setup_pip.py xgboost-deploy/setup.py
+	#make python
+
+pythonbuild:
+	make pythonpack
+	python setup.py install
+
+pythoncheck:
+	make pythonbuild
+	python -c 'import xgboost;print xgboost.core.find_lib_path()'
+
+# lint requires dmlc to be in current folder
+lint:
+	dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package python-package
+
 clean:
 	$(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
 	cd subtree/rabit; make clean; cd ..
```
R package build-ignore patterns (file header lost in the capture):

```diff
@@ -3,3 +3,4 @@
 \.dll$
 ^.*\.Rproj$
 ^\.Rproj\.user$
+README.md
```
R package DESCRIPTION:

```diff
@@ -1,18 +1,18 @@
 Package: xgboost
 Type: Package
-Title: eXtreme Gradient Boosting
-Version: 0.4-0
-Date: 2015-05-11
-Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>, Michael Benesty <michael@benesty.fr>
+Title: Extreme Gradient Boosting
+Version: 0.4-2
+Date: 2015-08-01
+Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>,
+    Michael Benesty <michael@benesty.fr>
 Maintainer: Tong He <hetong007@gmail.com>
-Description: Xgboost is short for eXtreme Gradient Boosting, which is an
-    efficient and scalable implementation of gradient boosting framework.
-    This package is an R wrapper of xgboost. The package includes efficient
-    linear model solver and tree learning algorithms. The package can automatically
-    do parallel computation with OpenMP, and it can be more than 10 times faster
-    than existing gradient boosting packages such as gbm. It supports various
-    objective functions, including regression, classification and ranking. The
-    package is made to be extensible, so that users are also allowed to define
-    their own objectives easily.
+Description: Extreme Gradient Boosting, which is an efficient implementation
+    of gradient boosting framework. This package is its R interface. The package
+    includes efficient linear model solver and tree learning algorithms. The package
+    can automatically do parallel computation on a single machine which could be
+    more than 10 times faster than existing gradient boosting packages. It supports
+    various objective functions, including regression, classification and ranking.
+    The package is made to be extensible, so that users are also allowed to define
+    their own objectives easily.
 License: Apache License (== 2.0) | file LICENSE
 URL: https://github.com/dmlc/xgboost
@@ -20,15 +20,18 @@ BugReports: https://github.com/dmlc/xgboost/issues
 VignetteBuilder: knitr
 Suggests:
     knitr,
-    ggplot2 (>= 1.0.0),
-    DiagrammeR (>= 0.6),
+    ggplot2 (>= 1.0.1),
+    DiagrammeR (>= 0.8.1),
     Ckmeans.1d.dp (>= 3.3.1),
-    vcd (>= 1.3)
+    vcd (>= 1.3),
+    testthat,
+    igraph (>= 1.0.1)
 Depends:
     R (>= 2.10)
 Imports:
     Matrix (>= 1.1-0),
    methods,
-    data.table (>= 1.9.4),
+    data.table (>= 1.9.6),
     magrittr (>= 1.5),
     stringr (>= 0.6.2)
+RoxygenNote: 5.0.1
```
R package NAMESPACE:

```diff
@@ -1,16 +1,19 @@
-# Generated by roxygen2 (4.1.1): do not edit by hand
+# Generated by roxygen2: do not edit by hand
 
 export(getinfo)
 export(setinfo)
 export(slice)
 export(xgb.DMatrix)
 export(xgb.DMatrix.save)
+export(xgb.create.features)
 export(xgb.cv)
 export(xgb.dump)
 export(xgb.importance)
 export(xgb.load)
 export(xgb.model.dt.tree)
+export(xgb.plot.deepness)
 export(xgb.plot.importance)
+export(xgb.plot.multi.trees)
 export(xgb.plot.tree)
 export(xgb.save)
 export(xgb.save.raw)
@@ -23,6 +26,7 @@ importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(Matrix,cBind)
 importFrom(Matrix,colSums)
+importFrom(Matrix,sparse.model.matrix)
 importFrom(Matrix,sparseVector)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
@@ -35,6 +39,7 @@ importFrom(data.table,setnames)
 importFrom(magrittr,"%>%")
 importFrom(magrittr,add)
 importFrom(magrittr,not)
+importFrom(stringr,str_detect)
 importFrom(stringr,str_extract)
 importFrom(stringr,str_extract_all)
 importFrom(stringr,str_match)
```
`getinfo` generic:

```diff
@@ -23,7 +23,6 @@ setClass('xgb.DMatrix')
 #' stopifnot(all(labels2 == 1-labels))
 #' @rdname getinfo
 #' @export
-#'
 getinfo <- function(object, ...){
   UseMethod("getinfo")
 }
@@ -54,4 +53,3 @@ setMethod("getinfo", signature = "xgb.DMatrix",
             }
             return(ret)
           })
-
```
`predict` method for `xgb.Booster`:

```diff
@@ -20,6 +20,17 @@ setClass("xgb.Booster",
 #' only valid for gbtree, but not for gblinear. set it to be value bigger
 #' than 0. It will use all trees by default.
 #' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
+#'
+#' @details
+#' The option \code{ntreelimit} purpose is to let the user train a model with lots
+#' of trees but use only the first trees for prediction to avoid overfitting
+#' (without having to train a new model with less trees).
+#'
+#' The option \code{predleaf} purpose is inspired from §3.1 of the paper
+#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
+#' The idea is to use the model as a generator of new features which capture non linear link
+#' from original features.
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
@@ -29,9 +40,8 @@ setClass("xgb.Booster",
 #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #' pred <- predict(bst, test$data)
 #' @export
-#'
 setMethod("predict", signature = "xgb.Booster",
-          definition = function(object, newdata, missing = NULL,
+          definition = function(object, newdata, missing = NA,
                                 outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
   if (class(object) != "xgb.Booster"){
     stop("predict: model in prediction must be of class xgb.Booster")
@@ -39,11 +49,7 @@ setMethod("predict", signature = "xgb.Booster",
     object <- xgb.Booster.check(object, saveraw = FALSE)
   }
   if (class(newdata) != "xgb.DMatrix") {
-    if (is.null(missing)) {
-      newdata <- xgb.DMatrix(newdata)
-    } else {
-      newdata <- xgb.DMatrix(newdata, missing = missing)
-    }
+    newdata <- xgb.DMatrix(newdata, missing = missing)
   }
   if (is.null(ntreelimit)) {
     ntreelimit <- 0
@@ -52,7 +58,7 @@ setMethod("predict", signature = "xgb.Booster",
       stop("predict: ntreelimit must be equal to or greater than 1")
     }
   }
-  option = 0
+  option <- 0
   if (outputmargin) {
     option <- option + 1
  }
@@ -72,4 +78,3 @@ setMethod("predict", signature = "xgb.Booster",
   }
   return(ret)
 })
-
```
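The `@details` block added above documents the two prediction options. A minimal usage sketch, assuming the bundled agaricus datasets used throughout the package's roxygen examples:

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')

# Train a small model (parameter style follows the roxygen examples above).
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 4, objective = "binary:logistic")

# ntreelimit: predict with only the first 2 of the 4 boosted trees,
# e.g. to check whether later rounds overfit.
pred_limited <- predict(bst, agaricus.test$data, ntreelimit = 2)

# predleaf: return a matrix of leaf indices, one column per tree,
# usable as categorical features for a downstream linear model.
leaf_index <- predict(bst, agaricus.test$data, predleaf = TRUE)
dim(leaf_index)  # (number of test rows) x (number of trees)
```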
`predict` method for `xgb.Booster.handle`:

```diff
@@ -13,7 +13,6 @@ setMethod("predict", signature = "xgb.Booster.handle",
 
   bst <- xgb.handleToBooster(object)
 
-  ret = predict(bst, ...)
+  ret <- predict(bst, ...)
   return(ret)
 })
-
```
`setinfo` generic:

```diff
@@ -21,7 +21,6 @@
 #' stopifnot(all(labels2 == 1-labels))
 #' @rdname setinfo
 #' @export
-#'
 setinfo <- function(object, ...){
   UseMethod("setinfo")
 }
```
`slice` method for `xgb.DMatrix`:

```diff
@@ -13,7 +13,6 @@ setClass('xgb.DMatrix')
 #' dsub <- slice(dtrain, 1:3)
 #' @rdname slice
 #' @export
-#'
 slice <- function(object, ...){
   UseMethod("slice")
 }
@@ -34,8 +33,8 @@ setMethod("slice", signature = "xgb.DMatrix",
             attr_list <- attributes(object)
             nr <- xgb.numrow(object)
             len <- sapply(attr_list,length)
-            ind <- which(len==nr)
-            if (length(ind)>0) {
+            ind <- which(len == nr)
+            if (length(ind) > 0) {
               nms <- names(attr_list)[ind]
               for (i in 1:length(ind)) {
                 attr(ret,nms[i]) <- attr(object,nms[i])[idxset]
```
Internal helpers (`xgb.setinfo`, `xgb.get.DMatrix`, `xgb.iter.*`, `xgb.cv.mknfold`, `xgb.createFolds`):

```diff
@@ -1,4 +1,4 @@
 #' @importClassesFrom Matrix dgCMatrix dgeMatrix
 #' @import methods
 
 # depends on matrix
@@ -15,14 +15,14 @@ xgb.setinfo <- function(dmat, name, info) {
     stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
   }
   if (name == "label") {
-    if (length(info)!=xgb.numrow(dmat))
+    if (length(info) != xgb.numrow(dmat))
       stop("The length of labels must equal to the number of rows in the input data")
     .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
           PACKAGE = "xgboost")
     return(TRUE)
   }
   if (name == "weight") {
-    if (length(info)!=xgb.numrow(dmat))
+    if (length(info) != xgb.numrow(dmat))
       stop("The length of weights must equal to the number of rows in the input data")
     .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
           PACKAGE = "xgboost")
@@ -36,7 +36,7 @@ xgb.setinfo <- function(dmat, name, info) {
     return(TRUE)
   }
   if (name == "group") {
-    if (sum(info)!=xgb.numrow(dmat))
+    if (sum(info) != xgb.numrow(dmat))
       stop("The sum of groups must equal to the number of rows in the input data")
     .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info),
           PACKAGE = "xgboost")
@@ -103,16 +103,15 @@ xgb.Booster.check <- function(bst, saveraw = TRUE)
 ## ----the following are low level iteratively function, not needed if
 ## you do not want to use them ---------------------------------------
 # get dmatrix from data, label
-xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
+xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
   inClass <- class(data)
   if (inClass == "dgCMatrix" || inClass == "matrix") {
     if (is.null(label)) {
       stop("xgboost: need label when data is a matrix")
     }
-    if (is.null(missing)){
-      dtrain <- xgb.DMatrix(data, label = label)
-    } else {
-      dtrain <- xgb.DMatrix(data, label = label, missing = missing)
-    }
+    dtrain <- xgb.DMatrix(data, label = label, missing = missing)
+    if (!is.null(weight)){
+      xgb.setinfo(dtrain, "weight", weight)
+    }
   } else {
     if (!is.null(label)) {
@@ -122,6 +121,9 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
     dtrain <- xgb.DMatrix(data)
   } else if (inClass == "xgb.DMatrix") {
     dtrain <- data
+  } else if (inClass == "data.frame") {
+    stop("xgboost only support numerical matrix input,
+         use 'data.matrix' to transform the data.")
   } else {
     stop("xgboost: Invalid input of data")
   }
@@ -140,8 +142,7 @@ xgb.iter.boost <- function(booster, dtrain, gpair) {
   if (class(dtrain) != "xgb.DMatrix") {
     stop("xgb.iter.update: second argument must be type xgb.DMatrix")
   }
-  .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess,
-        PACKAGE = "xgboost")
+  .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE = "xgboost")
   return(TRUE)
 }
 
@@ -157,7 +158,7 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
   if (is.null(obj)) {
     .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
           PACKAGE = "xgboost")
   } else {
     pred <- predict(booster, dtrain)
     gpair <- obj(pred, dtrain)
     succ <- xgb.iter.boost(booster, dtrain, gpair)
@@ -220,7 +221,8 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
     stop("nfold must be bigger than 1")
   }
   if(is.null(folds)) {
-    if (exists('objective', where=param) && strtrim(param[['objective']], 5) == 'rank:') {
+    if (exists('objective', where=param) && is.character(param$objective) &&
+        strtrim(param[['objective']], 5) == 'rank:') {
       stop("\tAutomatic creation of CV-folds is not implemented for ranking!\n",
            "\tConsider providing pre-computed CV-folds through the folds parameter.")
     }
@@ -234,7 +236,7 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
       # For classification, need to convert y labels to factor before making the folds,
       # and then do stratification by factor levels.
       # For regression, leave y numeric and do stratification by quantiles.
-      if (exists('objective', where=param)) {
+      if (exists('objective', where=param) && is.character(param$objective)) {
         # If 'objective' provided in params, assume that y is a classification label
         # unless objective is reg:linear
         if (param[['objective']] != 'reg:linear') y <- factor(y)
@@ -249,17 +251,17 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
       # make simple non-stratified folds
       kstep <- length(randidx) %/% nfold
       folds <- list()
-      for (i in 1:(nfold-1)) {
-        folds[[i]] = randidx[1:kstep]
-        randidx = setdiff(randidx, folds[[i]])
+      for (i in 1:(nfold - 1)) {
+        folds[[i]] <- randidx[1:kstep]
+        randidx <- setdiff(randidx, folds[[i]])
       }
-      folds[[nfold]] = randidx
+      folds[[nfold]] <- randidx
     }
   }
   ret <- list()
   for (k in 1:nfold) {
     dtest <- slice(dall, folds[[k]])
-    didx = c()
+    didx <- c()
     for (i in 1:nfold) {
       if (i != k) {
         didx <- append(didx, folds[[i]])
@@ -267,7 +269,7 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
     }
     dtrain <- slice(dall, didx)
     bst <- xgb.Booster(param, list(dtrain, dtest))
-    watchlist = list(train=dtrain, test=dtest)
+    watchlist <- list(train=dtrain, test=dtest)
     ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=folds[[k]])
   }
   return (ret)
@@ -287,7 +289,7 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) {
     }
     ret <- paste(ret, sprintf("%f", mean(stats)), sep="")
     if (showsd) {
-      ret <- paste(ret, sprintf("+%f", sd(stats)), sep="")
+      ret <- paste(ret, sprintf("+%f", stats::sd(stats)), sep="")
     }
   }
   return (ret)
@@ -308,11 +310,11 @@ xgb.createFolds <- function(y, k = 10)
     ## At most, we will use quantiles. If the sample
     ## is too small, we just do regular unstratified
     ## CV
-    cuts <- floor(length(y)/k)
-    if(cuts < 2) cuts <- 2
-    if(cuts > 5) cuts <- 5
+    cuts <- floor(length(y) / k)
+    if (cuts < 2) cuts <- 2
+    if (cuts > 5) cuts <- 5
     y <- cut(y,
-             unique(quantile(y, probs = seq(0, 1, length = cuts))),
+             unique(stats::quantile(y, probs = seq(0, 1, length = cuts))),
             include.lowest = TRUE)
   }
 
```
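The new `data.frame` branch in `xgb.get.DMatrix` turns a previously obscure failure into an explicit error. Following the message's own advice, the workaround is a one-line conversion; a sketch, where `df` and `y` are hypothetical stand-ins for an all-numeric data frame and its label vector:

```r
# Passing df directly would now stop with:
#   "xgboost only support numerical matrix input,
#    use 'data.matrix' to transform the data."
# The fix is the conversion the message suggests:
dtrain <- xgb.DMatrix(data.matrix(df), label = y)
```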
`xgb.DMatrix` constructor:

```diff
@@ -17,8 +17,7 @@
 #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 #' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 #' @export
-#'
-xgb.DMatrix <- function(data, info = list(), missing = 0, ...) {
+xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
   if (typeof(data) == "character") {
     handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE),
                     PACKAGE = "xgboost")
```
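The constructor default moves from `missing = 0` to `missing = NA`, matching the changelog entry "switched from 0 to NA for missing values". A small sketch of the effect on dense input:

```r
m <- matrix(c(1, NA, 0, 2), nrow = 2)
# NA cells are now treated as missing by default, so zeros are kept
# as real feature values instead of standing in for missingness.
dmat <- xgb.DMatrix(m, label = c(0, 1))
```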
`xgb.DMatrix.save`:

```diff
@@ -12,7 +12,6 @@
 #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 #' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 #' @export
-#'
 xgb.DMatrix.save <- function(DMatrix, fname) {
   if (typeof(fname) != "character") {
     stop("xgb.save: fname must be character")
```
91
R-package/R/xgb.create.features.R
Normal file
91
R-package/R/xgb.create.features.R
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
#' Create new features from a previously learned model
|
||||||
|
#'
|
||||||
|
#' May improve the learning by adding new features to the training data based on the decision trees from a previously learned model.
|
||||||
|
#'
|
||||||
|
#' @importFrom magrittr %>%
|
||||||
|
#' @importFrom Matrix cBind
|
||||||
|
#' @importFrom Matrix sparse.model.matrix
|
||||||
|
#'
|
||||||
|
#' @param model decision tree boosting model learned on the original data
|
||||||
|
#' @param training.data original data (usually provided as a \code{dgCMatrix} matrix)
|
||||||
|
#'
|
||||||
|
#' @return \code{dgCMatrix} matrix including both the original data and the new features.
|
||||||
|
#'
|
||||||
|
#' @details
|
||||||
|
#' This is the function inspired from the paragraph 3.1 of the paper:
|
||||||
|
#'
|
||||||
|
#' \strong{Practical Lessons from Predicting Clicks on Ads at Facebook}
|
||||||
|
#'
|
||||||
|
#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
|
||||||
|
#' Joaquin Quiñonero Candela)}
|
||||||
|
#'
|
||||||
|
#' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
|
||||||
|
#'
|
||||||
|
#' \url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
|
||||||
|
#'
|
||||||
|
#' Extract explaining the method:
|
||||||
|
#'
|
||||||
|
#' "\emph{We found that boosted decision trees are a powerful and very
|
||||||
|
#' convenient way to implement non-linear and tuple transformations
|
||||||
|
#' of the kind we just described. We treat each individual
|
||||||
|
#' tree as a categorical feature that takes as value the
|
||||||
|
#' index of the leaf an instance ends up falling in. We use
|
||||||
|
#' 1-of-K coding of this type of features.
|
||||||
|
#'
|
||||||
|
#' For example, consider the boosted tree model in Figure 1 with 2 subtrees,
|
||||||
|
#' where the first subtree has 3 leafs and the second 2 leafs. If an
|
||||||
|
#' instance ends up in leaf 2 in the first subtree and leaf 1 in
|
||||||
|
#' second subtree, the overall input to the linear classifier will
|
||||||
|
#' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
|
||||||
|
#' correspond to the leaves of the first subtree and last 2 to
|
||||||
|
#' those of the second subtree.
|
||||||
|
#'
|
||||||
|
#' [...]
|
||||||
|
#'
|
||||||
|
#' We can understand boosted decision tree
|
||||||
|
#' based transformation as a supervised feature encoding that
|
||||||
|
#' converts a real-valued vector into a compact binary-valued
|
||||||
|
#' vector. A traversal from root node to a leaf node represents
|
||||||
|
#' a rule on certain features.}"
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' data(agaricus.train, package='xgboost')
|
||||||
|
#' data(agaricus.test, package='xgboost')
|
||||||
|
#' dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
|
||||||
|
#' dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
|
||||||
|
#'
|
||||||
|
#' param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
|
||||||
|
#' nround = 4
|
||||||
|
#'
|
||||||
|
#' bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
|
||||||
|
#'
|
||||||
|
#' # Model accuracy without new features
|
||||||
|
#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
|
||||||
|
#'
|
||||||
|
#' # Convert previous features to one hot encoding
|
||||||
|
#' new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
|
||||||
|
#' new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
|
||||||
|
#'
|
||||||
|
#' # learning with new features
|
||||||
|
#' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
|
||||||
|
#' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
|
||||||
|
#' watchlist <- list(train = new.dtrain)
|
||||||
|
#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
|
||||||
|
#'
|
||||||
|
#' # Model accuracy with new features
|
||||||
|
#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
|
||||||
|
#'
|
||||||
|
#' # Here the accuracy was already good and is now perfect.
|
||||||
|
#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
xgb.create.features <- function(model, training.data){
|
||||||
|
pred_with_leaf = predict(model, training.data, predleaf = TRUE)
|
||||||
|
cols <- list()
|
||||||
|
for(i in 1:length(trees)){
|
||||||
|
# max is not the real max but it s not important for the purpose of adding features
|
||||||
|
leaf.id <- sort(unique(pred_with_leaf[,i]))
|
||||||
|
cols[[i]] <- factor(x = pred_with_leaf[,i], level = leaf.id)
|
||||||
|
}
|
||||||
|
cBind(training.data, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
|
||||||
|
}
|
||||||
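A note on what the function above actually builds: each tree becomes a categorical feature whose levels are its leaf indices, exactly the 1-of-K coding quoted from the paper. The following standalone sketch (not part of the diff) reproduces that encoding step by step; it assumes the bundled agaricus data and the predleaf interface shown above, and the helper names (leaves, cols) are illustrative only.

library(xgboost)
library(Matrix)

data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")

# One column per tree, each cell holding the index of the leaf the row falls in.
leaves <- predict(bst, agaricus.train$data, predleaf = TRUE)

# 1-of-K coding: each tree becomes a categorical feature whose levels are its leaf ids.
cols <- lapply(seq_len(ncol(leaves)),
               function(i) factor(leaves[, i], levels = sort(unique(leaves[, i]))))
names(cols) <- paste0("Tree", seq_along(cols))
new.features <- sparse.model.matrix(~ . - 1, data = as.data.frame(cols))
dim(new.features)  # each tree contributes indicator columns for its leaves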
@@ -54,11 +54,11 @@
 #' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
 #'   If folds are supplied, the nfold and stratified parameters would be ignored.
 #' @param verbose \code{boolean}, print the statistics during the process
-#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered.
+#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
+#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
 #'   If set to an integer \code{k}, training with a validation set will stop if the performance
 #'   keeps getting worse consecutively for \code{k} rounds.
-#' @param early.stop.round An alternative of \code{early_stop_round}.
-#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
+#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 #'   \code{maximize=TRUE} means the larger the evaluation score the better.
 #'
 #' @param ... other parameters to pass to \code{params}.
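Given the parameter changes above, a call exercising the renamed early.stop.round and the new print.every.n knob could look like the following sketch (bundled agaricus data assumed; round counts are arbitrary):

library(xgboost)
data(agaricus.train, package = 'xgboost')

# Cross-validation printing progress every 5th round only, stopping once the
# test error has not improved for 3 consecutive rounds.
history <- xgb.cv(data = agaricus.train$data, label = agaricus.train$label,
                  nfold = 5, nrounds = 50, metrics = list("error"),
                  early.stop.round = 3, maximize = FALSE, print.every.n = 5,
                  max.depth = 3, eta = 1, objective = "binary:logistic")
print(history)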
@@ -90,140 +90,158 @@
 #'              max.depth =3, eta = 1, objective = "binary:logistic")
 #' print(history)
 #' @export
-#'
-xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
+xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NA,
                    prediction = FALSE, showsd = TRUE, metrics=list(),
-                   obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,
-                   early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) {
+                   obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, print.every.n=1L,
+                   early.stop.round = NULL, maximize = NULL, ...) {
   if (typeof(params) != "list") {
     stop("xgb.cv: first argument params must be list")
   }
   if(!is.null(folds)) {
-    if(class(folds)!="list" | length(folds) < 2) {
+    if(class(folds) != "list" | length(folds) < 2) {
       stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold")
     }
     nfold <- length(folds)
   }
   if (nfold <= 1) {
     stop("nfold must be bigger than 1")
   }
-  if (is.null(missing)) {
-    dtrain <- xgb.get.DMatrix(data, label)
-  } else {
-    dtrain <- xgb.get.DMatrix(data, label, missing)
-  }
-  params <- append(params, list(...))
-  params <- append(params, list(silent=1))
-  for (mc in metrics) {
-    params <- append(params, list("eval_metric"=mc))
-  }
-
-  # Early Stopping
-  if (is.null(early_stop_round) && !is.null(early.stop.round))
-    early_stop_round = early.stop.round
-  if (!is.null(early_stop_round)){
-    if (!is.null(feval) && is.null(maximize))
-      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
-    if (is.null(maximize) && is.null(params$eval_metric))
-      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
-    if (is.null(maximize))
-    {
-      if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
-        maximize = FALSE
-      } else {
-        maximize = TRUE
-      }
-    }
-
-    if (maximize) {
-      bestScore = 0
-    } else {
-      bestScore = Inf
-    }
-    bestInd = 0
-    earlyStopflag = FALSE
-
-    if (length(metrics)>1)
-      warning('Only the first metric is used for early stopping process.')
-  }
-
-  xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
-  obj_type = params[['objective']]
-  mat_pred = FALSE
-  if (!is.null(obj_type) && obj_type=='multi:softprob')
-  {
-    num_class = params[['num_class']]
-    if (is.null(num_class))
-      stop('must set num_class to use softmax')
-    predictValues <- matrix(0,xgb.numrow(dtrain),num_class)
-    mat_pred = TRUE
-  }
-  else
-    predictValues <- rep(0,xgb.numrow(dtrain))
-  history <- c()
-  for (i in 1:nrounds) {
-    msg <- list()
-    for (k in 1:nfold) {
-      fd <- xgb_folds[[k]]
-      succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
-      if (i<nrounds) {
-        msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
-      } else {
-        if (!prediction) {
-          msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
-        } else {
-          res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
-          if (mat_pred) {
-            pred_mat = matrix(res[[2]],num_class,length(fd$index))
-            predictValues[fd$index,] <- t(pred_mat)
-          } else {
-            predictValues[fd$index] <- res[[2]]
-          }
-          msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
-        }
-      }
-    }
-    ret <- xgb.cv.aggcv(msg, showsd)
-    history <- c(history, ret)
-    if(verbose) paste(ret, "\n", sep="") %>% cat
-
-    # early_Stopping
-    if (!is.null(early_stop_round)){
-      score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+1]
-      score = strsplit(score,'\\+|:')[[1]][[2]]
-      score = as.numeric(score)
-      if ((maximize && score>bestScore) || (!maximize && score<bestScore)) {
-        bestScore = score
-        bestInd = i
-      } else {
-        if (i-bestInd>=early_stop_round) {
-          earlyStopflag = TRUE
-          cat('Stopping. Best iteration:',bestInd)
-          break
-        }
-      }
-    }
-  }
-
-  colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".")
-  colnamesMean <- paste(colnames, "mean")
-  if(showsd) colnamesStd <- paste(colnames, "std")
-
-  colnames <- c()
-  if(showsd) for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
-  else colnames <- colnamesMean
-
-  type <- rep(x = "numeric", times = length(colnames))
-  dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
-  split <- str_split(string = history, pattern = "\t")
-
-  for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)}
-
-  if (prediction) {
-    return(list(dt = dt,pred = predictValues))
-  }
-  return(dt)
+  dtrain <- xgb.get.DMatrix(data, label, missing)
+  dot.params <- list(...)
+  nms.params <- names(params)
+  nms.dot.params <- names(dot.params)
+  if (length(intersect(nms.params,nms.dot.params)) > 0)
+    stop("Duplicated defined term in parameters. Please check your list of params.")
+  params <- append(params, dot.params)
+  params <- append(params, list(silent=1))
+  for (mc in metrics) {
+    params <- append(params, list("eval_metric"=mc))
+  }
+
+  # customized objective and evaluation metric interface
+  if (!is.null(params$objective) && !is.null(obj))
+    stop("xgb.cv: cannot assign two different objectives")
+  if (!is.null(params$objective))
+    if (class(params$objective) == 'function') {
+      obj <- params$objective
+      params[['objective']] <- NULL
+    }
+  # if (!is.null(params$eval_metric) && !is.null(feval))
+  #   stop("xgb.cv: cannot assign two different evaluation metrics")
+  if (!is.null(params$eval_metric))
+    if (class(params$eval_metric) == 'function') {
+      feval <- params$eval_metric
+      params[['eval_metric']] <- NULL
+    }
+
+  # Early Stopping
+  if (!is.null(early.stop.round)){
+    if (!is.null(feval) && is.null(maximize))
+      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
+    if (is.null(maximize) && is.null(params$eval_metric))
+      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
+    if (is.null(maximize))
+    {
+      if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
+        maximize <- FALSE
+      } else {
+        maximize <- TRUE
+      }
+    }
+
+    if (maximize) {
+      bestScore <- 0
+    } else {
+      bestScore <- Inf
+    }
+    bestInd <- 0
+    earlyStopflag <- FALSE
+
+    if (length(metrics) > 1)
+      warning('Only the first metric is used for early stopping process.')
+  }
+
+  xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
+  obj_type <- params[['objective']]
+  mat_pred <- FALSE
+  if (!is.null(obj_type) && obj_type == 'multi:softprob')
+  {
+    num_class <- params[['num_class']]
+    if (is.null(num_class))
+      stop('must set num_class to use softmax')
+    predictValues <- matrix(0,xgb.numrow(dtrain),num_class)
+    mat_pred <- TRUE
+  }
+  else
+    predictValues <- rep(0,xgb.numrow(dtrain))
+  history <- c()
+  print.every.n <- max(as.integer(print.every.n), 1L)
+  for (i in 1:nrounds) {
+    msg <- list()
+    for (k in 1:nfold) {
+      fd <- xgb_folds[[k]]
+      succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
+      msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
+    }
+    ret <- xgb.cv.aggcv(msg, showsd)
+    history <- c(history, ret)
+    if(verbose)
+      if (0 == (i - 1L) %% print.every.n)
+        cat(ret, "\n", sep="")
+
+    # early_Stopping
+    if (!is.null(early.stop.round)){
+      score <- strsplit(ret,'\\s+')[[1]][2 + length(metrics)]
+      score <- strsplit(score,'\\+|:')[[1]][[2]]
+      score <- as.numeric(score)
+      if ( (maximize && score > bestScore) || (!maximize && score < bestScore)) {
+        bestScore <- score
+        bestInd <- i
+      } else {
+        if (i - bestInd >= early.stop.round) {
+          earlyStopflag <- TRUE
+          cat('Stopping. Best iteration:', bestInd, '\n')
+          break
+        }
+      }
+    }
+  }
+
+  if (prediction) {
+    for (k in 1:nfold) {
+      fd <- xgb_folds[[k]]
+      if (!is.null(early.stop.round) && earlyStopflag) {
+        res <- xgb.iter.eval(fd$booster, fd$watchlist, bestInd - 1, feval, prediction)
+      } else {
+        res <- xgb.iter.eval(fd$booster, fd$watchlist, nrounds - 1, feval, prediction)
+      }
+      if (mat_pred) {
+        pred_mat <- matrix(res[[2]],num_class,length(fd$index))
+        predictValues[fd$index,] <- t(pred_mat)
+      } else {
+        predictValues[fd$index] <- res[[2]]
+      }
+    }
+  }
+
+  colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".")
+  colnamesMean <- paste(colnames, "mean")
+  if(showsd) colnamesStd <- paste(colnames, "std")
+
+  colnames <- c()
+  if(showsd) for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
+  else colnames <- colnamesMean
+
+  type <- rep(x = "numeric", times = length(colnames))
+  dt <- utils::read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
+  split <- str_split(string = history, pattern = "\t")
+
+  for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist( list( dt, .), use.names = F, fill = F)}
+
+  if (prediction) {
+    return( list( dt = dt,pred = predictValues))
+  }
+  return(dt)
 }

 # Avoid error messages during CRAN check.
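The early-stopping branch above extracts the mean test score from the aggregated evaluation string. A standalone sketch of that parsing, on a made-up result line, shows what the two strsplit calls select:

# A line as aggregated by xgb.cv.aggcv (values invented for illustration):
ret <- "[3]\ttrain-error:0.012+0.001\ttest-error:0.021+0.003"
metrics <- list("error")

# Element 1 is the round tag, then one field per watchlist entry;
# 2 + length(metrics) lands on the test metric that drives early stopping.
score <- strsplit(ret, '\\s+')[[1]][2 + length(metrics)]  # "test-error:0.021+0.003"

# Split on ':' and '+' to separate the mean from the standard deviation.
score <- as.numeric(strsplit(score, '\\+|:')[[1]][[2]])   # 0.021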
@@ -36,7 +36,6 @@
 #' # print the model without saving it to a file
 #' print(xgb.dump(bst))
 #' @export
-#'
 xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
   if (class(model) != "xgb.Booster") {
     stop("model: argument must be type xgb.Booster")
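For reference, the text dump consumed by the parsers further down is produced as follows (a sketch using the bundled data; with.stats = TRUE keeps the gain and cover statistics that the importance functions need):

library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")

# Without fname the dump is returned as a character vector, one node per element.
dump <- xgb.dump(bst, with.stats = TRUE)
head(dump)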
@@ -1,7 +1,6 @@
 #' Show importance of features in a model
 #'
-#' Read a xgboost model text dump.
-#' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
+#' Create a \code{data.table} of the most important features of a model.
 #'
 #' @importFrom data.table data.table
 #' @importFrom data.table setnames
@@ -11,34 +10,30 @@
 #' @importFrom Matrix cBind
 #' @importFrom Matrix sparseVector
 #'
-#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
-#'
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
-#'
-#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
-#'
+#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
+#' @param model generated by the \code{xgb.train} function.
 #' @param data the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
-#'
 #' @param label the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
-#'
 #' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.
 #'
 #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 #'
 #' @details
-#' This is the function to understand the model trained (and through your model, your data).
-#'
-#' Results are returned for both linear and tree models.
+#' This function is for both linear and tree models.
 #'
 #' \code{data.table} is returned by the function.
-#' There are 3 columns :
+#' The columns are :
 #' \itemize{
-#'   \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
-#'   \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
-#'   \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
-#'   \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
+#'   \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
+#'   \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models);
+#'   \item \code{Cover} metric of the number of observation related to this feature (only available for tree models);
+#'   \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees.
 #' }
 #'
+#' If you don't provide \code{feature_names}, index of the features will be used instead.
+#'
+#' Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R).
+#'
 #' Co-occurence count
 #' ------------------
 #'
@@ -51,35 +46,26 @@
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #'
-#' # Both dataset are list with two items, a sparse matrix and labels
-#' # (labels = outcome column which will be learned).
-#' # Each column of the sparse Matrix is a feature in one hot encoding format.
-#' train <- agaricus.train
-#'
-#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
 #'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #'
-#' # train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.importance(train$data@@Dimnames[[2]], model = bst)
+#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
+#' xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst)
 #'
 #' # Same thing with co-occurence computation this time
-#' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label)
+#' xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst, data = agaricus.train$data, label = agaricus.train$label)
 #'
 #' @export
-xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ((x + label) == 2)){
+xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
   if (!class(feature_names) %in% c("character", "NULL")) {
-    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
+    stop("feature_names: Has to be a vector of character or NULL if the model already contains feature name. Look at this function documentation to see where to get feature names.")
   }

-  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
-    stop("filename_dump: Has to be a path to the model dump file.")
-  }
-
-  if (!class(model) %in% c("xgb.Booster", "NULL")) {
+  if (class(model) != "xgb.Booster") {
     stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
   }

-  if((is.null(data) & !is.null(label)) |(!is.null(data) & is.null(label))) {
+  if((is.null(data) & !is.null(label)) | (!is.null(data) & is.null(label))) {
     stop("data/label: Provide the two arguments if you want co-occurence computation or none of them if you are not interested but not one of them only.")
   }
@@ -87,17 +73,24 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
     if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector")
   }

-  if(is.null(model)){
-    text <- readLines(filename_dump)
-  } else {
-    text <- xgb.dump(model = model, with.stats = T)
-  }
+  treeDump <- function(feature_names, text, keepDetail){
+    if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature"
+    xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo" := Missing == No ][Feature != "Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequency = .N), by = groupBy, with = T][,`:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequency = Frequency / sum(Frequency))][order(Gain, decreasing = T)]
+  }

-  if(text[2] == "bias:"){
-    result <- readLines(filename_dump) %>% linearDump(feature_names, .)
+  linearDump <- function(feature_names, text){
+    weights <- which(text == "weight:") %>% {a = . + 1; text[a:length(text)]} %>% as.numeric
+    if(is.null(feature_names)) feature_names <- seq(to = length(weights))
+    data.table(Feature = feature_names, Weight = weights)
+  }
+
+  model.text.dump <- xgb.dump(model = model, with.stats = T)
+
+  if(model.text.dump[2] == "bias:"){
+    result <- model.text.dump %>% linearDump(feature_names, .)
     if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
   } else {
-    result <- treeDump(feature_names, text = text, keepDetail = !is.null(data))
+    result <- treeDump(feature_names, text = model.text.dump, keepDetail = !is.null(data))

     # Co-occurence computation
     if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
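The linearDump helper above walks a dump of a linear booster, which is laid out as a bias: section followed by a weight: section with one coefficient per line. A minimal sketch of that scan on a hypothetical dump vector (numbers invented):

# Shape of a linear-booster text dump:
text <- c("booster[0]", "bias:", "0.5", "weight:", "0.1", "-0.2", "0.3")

# Everything after the "weight:" marker is one coefficient per feature,
# which is exactly what linearDump extracts.
weights <- as.numeric(text[(which(text == "weight:") + 1):length(text)])
data.frame(Feature = seq_along(weights), Weight = weights)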
@@ -110,24 +103,12 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
       d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split])
       apply(c & d, 2, . %>% target %>% sum) -> vec

-      result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo:=NULL]
+      result <- result[, "RealCover" := as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo := NULL]
     }
   }
   result
 }

-treeDump <- function(feature_names, text, keepDetail){
-  if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature"
-
-  result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo":= Missing == No ][Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))][order(Gain, decreasing = T)]
-
-  result
-}
-
-linearDump <- function(feature_names, text){
-  which(text == "weight:") %>% {a=.+1;text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .)
-}
-
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
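The default target closure used for the co-occurrence count above is easy to check in isolation; a tiny sketch with made-up binary vectors:

# target counts rows where the binarised feature condition holds (x == 1)
# and the label is 1 as well, i.e. x + label == 2.
label <- c(1, 0, 1, 1, 0)
x     <- c(1, 1, 0, 1, 0)
target <- function(x) ((x + label) == 2)
sum(target(x))  # 2 rows co-occur: condition TRUE and label 1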
@@ -15,7 +15,6 @@
 #' bst <- xgb.load('xgb.model')
 #' pred <- predict(bst, test$data)
 #' @export
-#'
 xgb.load <- function(modelfile) {
   if (is.null(modelfile))
     stop("xgb.load: modelfile cannot be NULL")
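A quick round-trip pairing xgb.load with xgb.save (a sketch using the bundled data; the file name is arbitrary):

library(xgboost)
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")

# Save to disk, reload, and check that predictions are unchanged.
xgb.save(bst, 'xgb.model')
bst2 <- xgb.load('xgb.model')
stopifnot(all.equal(predict(bst, agaricus.test$data),
                    predict(bst2, agaricus.test$data)))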
@@ -1,6 +1,6 @@
-#' Convert tree model dump to data.table
+#' Parse boosted tree model text dump
 #'
-#' Read a tree model text dump and return a data.table.
+#' Parse a boosted tree model text dump and return a \code{data.table}.
 #'
 #' @importFrom data.table data.table
 #' @importFrom data.table set
@@ -12,20 +12,20 @@
 #' @importFrom magrittr add
 #' @importFrom stringr str_extract
 #' @importFrom stringr str_split
-#' @importFrom stringr str_extract
 #' @importFrom stringr str_trim
-#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
-#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
-#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
-#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
+#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).
+#' @param model object created by the \code{xgb.train} function.
+#' @param text \code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).
+#' @param n_first_tree limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending of the size of the model.
 #'
-#' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing.
+#' @return A \code{data.table} of the features used in the model with their gain, cover and few other information.
 #'
 #' @details
-#' General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it.
+#' General function to convert a text dump of tree model to a \code{data.table}.
 #'
-#' The content of the \code{data.table} is organised that way:
+#' The purpose is to help user to explore the model and get a better understanding of it.
+#'
+#' The columns of the \code{data.table} are:
 #'
 #' \itemize{
 #'   \item \code{ID}: unique identifier of a node ;
@@ -37,56 +37,40 @@
 #'   \item \code{Quality}: it's the gain related to the split in this specific node ;
 #'   \item \code{Cover}: metric to measure the number of observation affected by the split ;
 #'   \item \code{Tree}: ID of the tree. It is included in the main ID ;
-#'   \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ;
+#'   \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ;
 #' }
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #'
-#' #Both dataset are list with two items, a sparse matrix and labels
-#' #(labels = outcome column which will be learned).
-#' #Each column of the sparse Matrix is a feature in one hot encoding format.
-#' train <- agaricus.train
-#'
-#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
 #'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #'
-#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
+#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
+#' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst)
 #'
 #' @export
-xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
+xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){

   if (!class(feature_names) %in% c("character", "NULL")) {
     stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
   }
-  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
-    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
-  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
-    stop("filename_dump: path to the model doesn't exist.")
-  } else if(is.null(filename_dump) && is.null(model) && is.null(text)){
-    stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
-  }
-
-  if (!class(model) %in% c("xgb.Booster", "NULL")) {
-    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
-  }
-
-  if (!class(text) %in% c("character", "NULL")) {
-    stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
+  if (class(model) != "xgb.Booster" & class(text) != "character") {
+    "model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.\n" %>%
+      paste0("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") %>%
+      stop()
   }

   if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
     stop("n_first_tree: Has to be a numeric vector of size 1.")
   }

-  if(!is.null(model)){
-    text = xgb.dump(model = model, with.stats = T)
-  } else if(!is.null(filename_dump)){
-    text <- readLines(filename_dump) %>% str_trim(side = "both")
+  if(is.null(text)){
+    text <- xgb.dump(model = model, with.stats = T)
   }

-  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1)
+  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)

   extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
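With filename_dump gone, the function accepts either a booster or a dump; both entry points can be exercised as in this sketch (bundled data assumed):

library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")

# Either hand the booster over directly...
dt1 <- xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)

# ...or pass a dump produced separately; with.stats = TRUE is required there.
dt2 <- xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]],
                         text = xgb.dump(bst, with.stats = TRUE))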
@@ -96,15 +80,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model

   allTrees <- data.table()

-  anynumber_regex<-"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
-  for(i in 1:n_round){
+  anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
+  for (i in 1:n_round){

-    tree <- text[(position[i]+1):(position[i+1]-1)]
+    tree <- text[(position[i] + 1):(position[i + 1] - 1)]

     # avoid tree made of a leaf only (no split)
-    if(length(tree) <2) next
+    if(length(tree) < 2) next

-    treeID <- i-1
+    treeID <- i - 1

     notLeaf <- str_match(tree, "leaf") %>% is.na
     leaf <- notLeaf %>% not %>% tree[.]
@@ -128,38 +112,37 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
     qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex))
     coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
     coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
-    dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID]
+    dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree := treeID]

     allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
   }

-  yes <- allTrees[!is.na(Yes),Yes]
+  yes <- allTrees[!is.na(Yes), Yes]

-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
       j = "Yes.Feature",
-      value = allTrees[ID == yes,Feature])
+      value = allTrees[ID %in% yes, Feature])

-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
       j = "Yes.Cover",
-      value = allTrees[ID == yes,Cover])
+      value = allTrees[ID %in% yes, Cover])

-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
       j = "Yes.Quality",
-      value = allTrees[ID == yes,Quality])
+      value = allTrees[ID %in% yes, Quality])

-  no <- allTrees[!is.na(No),No]
+  no <- allTrees[!is.na(No), No]

-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
       j = "No.Feature",
-      value = allTrees[ID == no,Feature])
+      value = allTrees[ID %in% no, Feature])

-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
       j = "No.Cover",
-      value = allTrees[ID == no,Cover])
+      value = allTrees[ID %in% no, Cover])

-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
       j = "No.Quality",
-      value = allTrees[ID == no,Quality])
+      value = allTrees[ID %in% no, Quality])

   allTrees
 }
@@ -167,4 +150,4 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence"))
+globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequency"))
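The extract helper and anynumber_regex above pull key=value numbers out of individual dump lines; a sketch on a hypothetical branch line (values invented) shows the mechanics:

library(stringr)
library(magrittr)

# A branch line from a dump produced with statistics:
branch <- "0:[f29<2] yes=1,no=2,missing=1,gain=4000.53,cover=1628.25"

extract <- function(x, pattern) str_extract(x, pattern) %>%
  str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist

anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
extract(branch, paste0("gain=", anynumber_regex))   # 4000.53
extract(branch, "cover=\\d*\\.*\\d*")               # 1628.25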
R-package/R/xgb.plot.deepness.R (new file)
@@ -0,0 +1,160 @@
+#' Plot multiple graphs at the same time
+#'
+#' Plot multiple graphs aligned by rows and columns.
+#'
+#' @importFrom data.table data.table
+#' @param cols number of columns
+#' @return NULL
+multiplot <- function(..., cols = 1) {
+  plots <- list(...)
+  numPlots = length(plots)
+
+  layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
+                   ncol = cols, nrow = ceiling(numPlots / cols))
+
+  if (numPlots == 1) {
+    print(plots[[1]])
+  } else {
+    grid::grid.newpage()
+    grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))
+    for (i in 1:numPlots) {
+      # Get the i,j matrix positions of the regions that contain this subplot
+      matchidx <- as.data.table(which(layout == i, arr.ind = TRUE))
+
+      print(
+        plots[[i]], vp = grid::viewport(
+          layout.pos.row = matchidx$row,
+          layout.pos.col = matchidx$col
+        )
+      )
+    }
+  }
+}
+
+#' Parse the graph to extract vector of edges
+#' @param element igraph object containing the path from the root to the leaf.
+edge.parser <- function(element) {
+  edges.vector <- igraph::as_ids(element)
+  t <- tail(edges.vector, n = 1)
+  l <- length(edges.vector)
+  list(t,l)
+}
+
+#' Extract path from root to leaf from data.table
+#' @param dt.tree data.table containing the nodes and edges of the trees
+get.paths.to.leaf <- function(dt.tree) {
+  dt.not.leaf.edges <-
+    dt.tree[Feature != "Leaf",.(ID, Yes, Tree)] %>% list(dt.tree[Feature != "Leaf",.(ID, No, Tree)]) %>% rbindlist(use.names = F)
+
+  trees <- dt.tree[,unique(Tree)]
+
+  paths <- list()
+  for (tree in trees) {
+    graph <-
+      igraph::graph_from_data_frame(dt.not.leaf.edges[Tree == tree])
+    paths.tmp <-
+      igraph::shortest_paths(graph, from = paste0(tree, "-0"), to = dt.tree[Tree == tree &
+                                                                              Feature == "Leaf", c(ID)])
+    paths <- c(paths, paths.tmp$vpath)
+  }
+  paths
+}
+
+#' Plot model trees deepness
+#'
+#' Generate a graph to plot the distribution of deepness among trees.
+#'
+#' @importFrom data.table data.table
+#' @importFrom data.table rbindlist
+#' @importFrom data.table setnames
+#' @importFrom data.table :=
+#' @importFrom magrittr %>%
+#' @param model dump generated by the \code{xgb.train} function.
+#'
+#' @return Two graphs showing the distribution of the model deepness.
+#'
+#' @details
+#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
+#' by tree deepness level.
+#'
+#' The purpose of this function is to help the user to find the best trade-off to set
+#' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
+#'
+#' See \link{xgb.train} for more information about these parameters.
+#'
+#' The graph is made of two parts:
+#'
+#' \itemize{
+#'   \item Count: number of leaf per level of deepness;
+#'   \item Weighted cover: normalized weighted cover per leaf (weighted number of instances).
+#' }
+#'
+#' This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
+#'
+#' @examples
+#' data(agaricus.train, package='xgboost')
+#'
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
+#'                eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
+#'                min_child_weight = 50)
+#'
+#' xgb.plot.deepness(model = bst)
+#'
+#' @export
xgb.plot.deepness <- function(model = NULL) {
+  if (!requireNamespace("ggplot2", quietly = TRUE)) {
+    stop("ggplot2 package is required for plotting the graph deepness.",
+         call. = FALSE)
+  }
+
+  if (!requireNamespace("igraph", quietly = TRUE)) {
+    stop("igraph package is required for plotting the graph deepness.",
+         call. = FALSE)
+  }
+
+  if (!requireNamespace("grid", quietly = TRUE)) {
+    stop("grid package is required for plotting the graph deepness.",
+         call. = FALSE)
+  }
+
+  if (class(model) != "xgb.Booster") {
+    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
+  }
+
+  dt.tree <- xgb.model.dt.tree(model = model)
+
+  dt.edge.elements <- data.table()
+  paths <- get.paths.to.leaf(dt.tree)
+
+  dt.edge.elements <-
+    lapply(paths, edge.parser) %>% rbindlist %>% setnames(c("last.edge", "size")) %>%
+    merge(dt.tree, by.x = "last.edge", by.y = "ID") %>% rbind(dt.edge.elements)
+
+  dt.edge.summuize <-
+    dt.edge.elements[, .(.N, Cover = sum(Cover)), size][,Cover:= Cover / sum(Cover)]
+
+  p1 <-
+    ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) +
+    ggplot2::xlab("") + ggplot2::ylab("Count") + ggplot2::ggtitle("Model complexity") +
+    ggplot2::theme(
+      plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
+      panel.grid.major.y = ggplot2::element_blank(),
+      axis.ticks = ggplot2::element_blank(),
+      axis.text.x = ggplot2::element_blank()
+    )
+
+  p2 <-
+    ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x = size, y = Cover, group = 1)) +
+    ggplot2::xlab("From root to leaf path length") + ggplot2::ylab("Weighted cover")
+
+  multiplot(p1, p2, cols = 1)
+}
+
+# Avoid error messages during CRAN check.
+# The reason is that these variables are never declared
+# They are mainly column names inferred by Data.table...
+globalVariables(
+  c(
+    "Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree"
+  )
+)
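The multiplot helper above is plain grid viewport arithmetic; the same stacking can be reproduced directly, as in this sketch with two throwaway ggplot2 panels (data invented):

library(ggplot2)

d <- data.frame(x = 1:10, y = (1:10)^2)
p1 <- ggplot(d, aes(x, y)) + geom_line()
p2 <- ggplot(d, aes(x, sqrt(y))) + geom_point()

# One column, two rows: the layout multiplot(p1, p2, cols = 1) would produce.
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(2, 1)))
print(p1, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 1))
print(p2, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1))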
@@ -1,6 +1,6 @@
 #' Plot feature importance bar graph
 #'
-#' Read a data.table containing feature importance details and plot it.
+#' Read a data.table containing feature importance details and plot it (for both GLM and Trees).
 #'
 #' @importFrom magrittr %>%
 #' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function.
@@ -10,7 +10,7 @@
 #'
 #' @details
 #' The purpose of this function is to easily represent the importance of each feature of a model.
-#' The function return a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
+#' The function returns a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
 #' In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
 #'
 #' @examples
@@ -19,39 +19,61 @@
|
|||||||
#' #Both dataset are list with two items, a sparse matrix and labels
|
#' #Both dataset are list with two items, a sparse matrix and labels
|
||||||
#' #(labels = outcome column which will be learned).
|
#' #(labels = outcome column which will be learned).
|
||||||
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
|
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
|
||||||
#' train <- agaricus.train
|
|
||||||
#'
|
#'
|
||||||
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
|
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
|
||||||
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
|
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
|
||||||
#'
|
#'
|
||||||
#' #train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
|
#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
|
||||||
#' importance_matrix <- xgb.importance(train$data@@Dimnames[[2]], model = bst)
|
#' importance_matrix <- xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst)
|
||||||
#' xgb.plot.importance(importance_matrix)
|
#' xgb.plot.importance(importance_matrix)
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
-xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){
-  if (!"data.table" %in% class(importance_matrix)) {
-    stop("importance_matrix: Should be a data.table.")
-  }
-  if (!require(ggplot2, quietly = TRUE)) {
-    stop("ggplot2 package is required for plotting the importance", call. = FALSE)
-  }
-  if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
-    stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
-  }
-
-  # To avoid issues in clustering when co-occurrences are used
-  importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
-
-  clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[, Gain], numberOfClusters))
-  importance_matrix[, "Cluster" := clusters$cluster %>% as.character]
-
-  plot <- ggplot(importance_matrix, aes(x = reorder(Feature, Gain), y = Gain, width = 0.05), environment = environment()) + geom_bar(aes(fill = Cluster), stat = "identity", position = "identity") + coord_flip() + xlab("Features") + ylab("Gain") + ggtitle("Feature importance") + theme(plot.title = element_text(lineheight = .9, face = "bold"), panel.grid.major.y = element_blank())
-
-  return(plot)
-}
+xgb.plot.importance <-
+  function(importance_matrix = NULL, numberOfClusters = c(1:10)) {
+    if (!"data.table" %in% class(importance_matrix)) {
+      stop("importance_matrix: Should be a data.table.")
+    }
+    if (!requireNamespace("ggplot2", quietly = TRUE)) {
+      stop("ggplot2 package is required for plotting the importance", call. = FALSE)
+    }
+    if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
+      stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
+    }
+
+    if (isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Gain", "Cover", "Frequency")))) {
+      y.axe.name <- "Gain"
+    } else if (isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Weight")))) {
+      y.axe.name <- "Weight"
+    } else {
+      stop("Importance matrix is not correct (column names issue)")
+    }
+
+    # To avoid issues in clustering when co-occurrences are used
+    importance_matrix <-
+      importance_matrix[, .(Gain.or.Weight = sum(get(y.axe.name))), by = Feature]
+
+    clusters <-
+      suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[, Gain.or.Weight], numberOfClusters))
+    importance_matrix[, "Cluster" := clusters$cluster %>% as.character]
+
+    plot <-
+      ggplot2::ggplot(
+        importance_matrix, ggplot2::aes(
+          x = stats::reorder(Feature, Gain.or.Weight), y = Gain.or.Weight, width = 0.05
+        ), environment = environment()
+      ) +
+      ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position = "identity") +
+      ggplot2::coord_flip() +
+      ggplot2::xlab("Features") +
+      ggplot2::ylab(y.axe.name) +
+      ggplot2::ggtitle("Feature importance") +
+      ggplot2::theme(
+        plot.title = ggplot2::element_text(lineheight = .9, face = "bold"),
+        panel.grid.major.y = ggplot2::element_blank()
+      )
+
+    return(plot)
+  }
 
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c("Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"))
+globalVariables(
+  c(
+    "Feature", "Gain.or.Weight", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip",
+    "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"
+  )
+)
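For orientation, here is a minimal sketch of driving the refactored helper, assembled from the roxygen example above; it assumes the xgboost, ggplot2 and Ckmeans.1d.dp packages are installed.

```r
# A sketch built from the roxygen example above, not a new API.
library(xgboost)
data(agaricus.train, package = 'xgboost')

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nthread = 2, nround = 2,
               objective = "binary:logistic")

# xgb.importance returns a data.table with Feature/Gain/Cover/Frequency,
# the first of the two column layouts the function accepts.
importance_matrix <- xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst)
print(xgb.plot.importance(importance_matrix))
```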
R-package/R/xgb.plot.multi.trees.R (new file, 114 lines)
@@ -0,0 +1,114 @@
+#' Project all trees on one tree and plot it
+#'
+#' Visualization of the ensemble of trees as a single collective unit.
+#'
+#' @importFrom data.table data.table
+#' @importFrom data.table rbindlist
+#' @importFrom data.table setnames
+#' @importFrom data.table :=
+#' @importFrom magrittr %>%
+#' @importFrom stringr str_detect
+#' @importFrom stringr str_extract
+#' @importFrom stringr str_replace
+#'
+#' @param model dump generated by the \code{xgb.train} function.
+#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.
+#' @param features.keep number of features to keep in each position of the multi trees.
+#' @param plot.width width in pixels of the graph to produce
+#' @param plot.height height in pixels of the graph to produce
+#'
+#' @return A \code{DiagrammeR} graph of the projected trees.
+#'
+#' @details
+#'
+#' This function tries to capture the complexity of a gradient boosted tree ensemble
+#' in a cohesive way.
+#'
+#' The goal is to improve the interpretability of a model generally seen as a black box.
+#' The function is dedicated to boosting applied to decision trees only.
+#'
+#' The purpose is to move from an ensemble of trees to a single tree only.
+#'
+#' It takes advantage of the fact that the shape of a binary tree is only defined by
+#' its deepness (therefore, in a boosting model, all trees have the same shape).
+#'
+#' Moreover, the trees tend to reuse the same features.
+#'
+#' The function will project each tree on one, and keep for each position the
+#' \code{features.keep} first features (based on the Gain per feature measure).
+#'
+#' This function is inspired by this blog post:
+#' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
+#'
+#' @examples
+#' data(agaricus.train, package='xgboost')
+#'
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
+#'                eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
+#'                min_child_weight = 50)
+#'
+#' p <- xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@@Dimnames[[2]], features.keep = 3)
+#' print(p)
+#'
+#' @export
+xgb.plot.multi.trees <- function(model, feature_names = NULL, features.keep = 5, plot.width = NULL, plot.height = NULL){
+  tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model)
+
+  # first number of the path represents the tree, then the following numbers are related to the path to follow
+  # root init
+  root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID]
+  tree.matrix[ID %in% root.nodes, abs.node.position := root.nodes]
+
+  precedent.nodes <- root.nodes
+
+  while(tree.matrix[, sum(is.na(abs.node.position))] > 0) {
+    yes.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(Yes)]
+    no.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(No)]
+    yes.nodes.abs.pos <- yes.row.nodes[, abs.node.position] %>% paste0("_0")
+    no.nodes.abs.pos <- no.row.nodes[, abs.node.position] %>% paste0("_1")
+
+    tree.matrix[ID %in% yes.row.nodes[, Yes], abs.node.position := yes.nodes.abs.pos]
+    tree.matrix[ID %in% no.row.nodes[, No], abs.node.position := no.nodes.abs.pos]
+    precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
+  }
+
+  tree.matrix[!is.na(Yes), Yes := paste0(abs.node.position, "_0")]
+  tree.matrix[!is.na(No), No := paste0(abs.node.position, "_1")]
+
+  remove.tree <- . %>% str_replace(pattern = "^\\d+-", replacement = "")
+
+  tree.matrix[, `:=`(abs.node.position = remove.tree(abs.node.position),
+                     Yes = remove.tree(Yes), No = remove.tree(No))]
+
+  nodes.dt <- tree.matrix[, .(Quality = sum(Quality)), by = .(abs.node.position, Feature)][
+    , .(Text = paste0(Feature[1:min(length(Feature), features.keep)],
+                      " (",
+                      Quality[1:min(length(Quality), features.keep)],
+                      ")") %>% paste0(collapse = "\n")),
+    by = abs.node.position]
+  edges.dt <- tree.matrix[Feature != "Leaf", .(abs.node.position, Yes)] %>%
+    list(tree.matrix[Feature != "Leaf", .(abs.node.position, No)]) %>%
+    rbindlist() %>%
+    setnames(c("From", "To")) %>%
+    .[, .N, .(From, To)] %>%
+    .[, N := NULL]
+
+  nodes <- DiagrammeR::create_nodes(nodes = nodes.dt[, abs.node.position],
+                                    label = nodes.dt[, Text],
+                                    style = "filled",
+                                    color = "DimGray",
+                                    fillcolor = "Beige",
+                                    shape = "oval",
+                                    fontname = "Helvetica")
+
+  edges <- DiagrammeR::create_edges(from = edges.dt[, From],
+                                    to = edges.dt[, To],
+                                    color = "DimGray",
+                                    arrowsize = "1.5",
+                                    arrowhead = "vee",
+                                    fontname = "Helvetica",
+                                    rel = "leading_to")
+
+  graph <- DiagrammeR::create_graph(nodes_df = nodes,
+                                    edges_df = edges,
+                                    graph_attrs = "rankdir = LR")
+
+  DiagrammeR::render_graph(graph, width = plot.width, height = plot.height)
+}
+
+globalVariables(
+  c(
+    "Feature", "no.nodes.abs.pos", "ID", "Yes", "No", "Tree", "yes.nodes.abs.pos", "abs.node.position"
+  )
+)
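The while-loop above rewrites each node ID into a tree-independent position string, which is what lets equal positions from different trees be merged. A hedged toy illustration of that encoding (the IDs here are invented for the example, not real xgboost output):

```r
# Starting from a root such as "0-0", the yes-branch child gets suffix "_0"
# and the no-branch child "_1"; stripping the leading "<tree>-" prefix then
# makes positions comparable across trees.
library(data.table)
library(stringr)

toy <- data.table(ID = c("0-0", "0-1", "0-2", "1-0", "1-1", "1-2"))
toy[, abs.node.position := c("0-0", "0-0_0", "0-0_1", "1-0", "1-0_0", "1-0_1")]
toy[, abs.node.position := str_replace(abs.node.position, "^\\d+-", "")]
print(toy)  # node "0_0" of tree 0 and tree 1 now share the same position
```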
@@ -1,27 +1,15 @@
 #' Plot a boosted tree model
 #'
-#' Read a tree model text dump.
-#' Plotting only works for boosted tree model (not linear model).
+#' Read a tree model text dump and plot the model.
 #'
 #' @importFrom data.table data.table
-#' @importFrom data.table set
-#' @importFrom data.table rbindlist
 #' @importFrom data.table :=
-#' @importFrom data.table copy
 #' @importFrom magrittr %>%
-#' @importFrom magrittr not
-#' @importFrom magrittr add
-#' @importFrom stringr str_extract
-#' @importFrom stringr str_split
-#' @importFrom stringr str_extract
-#' @importFrom stringr str_trim
-#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
+#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.
 #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
 #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
-#' @param CSSstyle a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
-#' @param width the width of the diagram in pixels.
-#' @param height the height of the diagram in pixels.
+#' @param plot.width the width of the diagram in pixels.
+#' @param plot.height the height of the diagram in pixels.
 #'
 #' @return A \code{DiagrammeR} of the model.
 #'
@@ -30,37 +18,26 @@
 #' The content of each node is organised that way:
 #'
 #' \itemize{
-#' \item \code{feature} value ;
-#' \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ;
+#' \item \code{feature} value;
+#' \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. The deeper in the tree a node is, the lower this metric will be;
 #' \item \code{gain}: metric of the importance of the node in the model.
 #' }
 #'
-#' Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
-#' It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
+#' The function uses the \href{http://www.graphviz.org/}{GraphViz} library for that purpose.
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #'
-#' #Both dataset are list with two items, a sparse matrix and labels
-#' #(labels = outcome column which will be learned).
-#' #Each column of the sparse Matrix is a feature in one hot encoding format.
-#' train <- agaricus.train
-#'
-#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
 #'                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
 #'
-#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
+#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
+#' xgb.plot.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst)
 #'
 #' @export
-#'
-xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){
+xgb.plot.tree <- function(feature_names = NULL, model = NULL, n_first_tree = NULL, plot.width = NULL, plot.height = NULL){
 
-  if (!(class(CSSstyle) %in% c("character", "NULL") && length(CSSstyle) <= 1)) {
-    stop("style: Has to be a character vector of size 1.")
-  }
-
-  if (!class(model) %in% c("xgb.Booster", "NULL")) {
+  if (class(model) != "xgb.Booster") {
     stop("model: Has to be an object of class xgb.Booster model generated by the xgb.train function.")
   }
 
@@ -68,30 +45,40 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU
     stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE)
   }
 
-  if(is.null(model)){
-    allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
-  } else {
-    allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
-  }
+  allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
 
-  allTrees[Feature != "Leaf", yesPath := paste(ID, "(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
-  allTrees[Feature != "Leaf", noPath := paste(ID, "(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
+  allTrees[, label := paste0(Feature, "\nCover: ", Cover, "\nGain: ", Quality)]
+  allTrees[, shape := "rectangle"][Feature == "Leaf", shape := "oval"]
+  allTrees[, filledcolor := "Beige"][Feature == "Leaf", filledcolor := "Khaki"]
 
-  if(is.null(CSSstyle)){
-    CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
-  }
+  # rev is used to put the first tree on top.
+  nodes <- DiagrammeR::create_nodes(nodes = allTrees[, ID] %>% rev,
+                                    label = allTrees[, label] %>% rev,
+                                    style = "filled",
+                                    color = "DimGray",
+                                    fillcolor = allTrees[, filledcolor] %>% rev,
+                                    shape = allTrees[, shape] %>% rev,
+                                    data = allTrees[, Feature] %>% rev,
+                                    fontname = "Helvetica")
 
-  yes <- allTrees[Feature != "Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
-  no <- allTrees[Feature != "Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
+  edges <- DiagrammeR::create_edges(from = allTrees[Feature != "Leaf", c(ID)] %>% rep(2),
+                                    to = allTrees[Feature != "Leaf", c(Yes, No)],
+                                    label = allTrees[Feature != "Leaf", paste("<", Split)] %>% c(rep("", nrow(allTrees[Feature != "Leaf"]))),
+                                    color = "DimGray",
+                                    arrowsize = "1.5",
+                                    arrowhead = "vee",
+                                    fontname = "Helvetica",
+                                    rel = "leading_to")
 
-  path <- allTrees[Feature != "Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", ., collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";")
-  DiagrammeR::mermaid(path, width, height)
+  graph <- DiagrammeR::create_graph(nodes_df = nodes,
+                                    edges_df = edges,
+                                    graph_attrs = "rankdir = LR")
+
+  DiagrammeR::render_graph(graph, width = plot.width, height = plot.height)
 }
 
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c("Feature", "yesPath", "ID", "Cover", "Quality", "Split", "Yes", "Yes.Feature", "noPath", "No", "No.Feature", "."))
+globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", ".", "shape", "filledcolor", "label"))
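A hedged usage sketch of the reworked DiagrammeR-based plotting path, reusing the model from the roxygen example above; `plot.width`/`plot.height` are optional pixel sizes.

```r
# Assumes bst and agaricus.train as trained in the roxygen example above.
xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]],
              model = bst, n_first_tree = 1,
              plot.width = 800, plot.height = 600)
```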
@@ -16,7 +16,6 @@
 #' bst <- xgb.load('xgb.model')
 #' pred <- predict(bst, test$data)
 #' @export
-#'
 xgb.save <- function(model, fname) {
   if (typeof(fname) != "character") {
     stop("xgb.save: fname must be character")
@@ -16,7 +16,6 @@
 #' bst <- xgb.load(raw)
 #' pred <- predict(bst, test$data)
 #' @export
-#'
 xgb.save.raw <- function(model) {
   if (class(model) == "xgb.Booster"){
     model <- model$handle
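Taken together, the two functions above give a file-based and an in-memory persistence path. A hedged round-trip sketch, assuming a trained booster `bst` as in the surrounding examples:

```r
# File round trip: write the model to disk, then load it back.
xgb.save(bst, 'xgb.model')
bst2 <- xgb.load('xgb.model')

# In-memory round trip: serialize to a raw vector, then load from it,
# as the xgb.save.raw example above does.
raw_model <- xgb.save.raw(bst)
bst3 <- xgb.load(raw_model)
```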
@@ -19,7 +19,7 @@
 #' \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
 #' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. The larger, the more conservative the algorithm will be.
 #' \item \code{max_depth} maximum depth of a tree. Default: 6
-#' \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
+#' \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
 #' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
 #' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
 #' \item \code{num_parallel_tree} Experimental parameter. Number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
@@ -36,19 +36,19 @@
 #' 3. Task Parameters
 #'
 #' \itemize{
-#' \item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
+#' \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
 #'   \itemize{
 #'     \item \code{reg:linear} linear regression (Default).
 #'     \item \code{reg:logistic} logistic regression.
 #'     \item \code{binary:logistic} logistic regression for binary classification. Output probability.
 #'     \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
 #'     \item \code{num_class} set the number of classes. To use only with multiclass objectives.
-#'     \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{tonum_class}.
+#'     \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class}.
 #'     \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
 #'     \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
 #'   }
 #' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
-#' \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
+#' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective (rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
 #' }
 #'
 #' @param data takes an \code{xgb.DMatrix} as the input.
@@ -66,13 +66,14 @@
 #' prediction and dtrain,
 #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
 #'        information of performance. If 2, xgboost will print information of both
-#' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
-#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered.
+#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
+#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
 #' If set to an integer \code{k}, training with a validation set will stop if the performance
 #' keeps getting worse consecutively for \code{k} rounds.
-#' @param early.stop.round An alternative of \code{early_stop_round}.
-#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
+#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 #' \code{maximize=TRUE} means the larger the evaluation score the better.
+#' @param save_period save the model to the disk in every \code{save_period} rounds, 0 means no such action.
+#' @param save_name the name or path for periodically saved model file.
 #' @param ... other parameters to pass to \code{params}.
 #'
 #' @details
@@ -88,6 +89,7 @@
 #' \itemize{
 #' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
 #' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
+#' \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss}
 #' \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
 #' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
 #' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
@@ -103,7 +105,6 @@
 #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 #' dtest <- dtrain
 #' watchlist <- list(eval = dtest, train = dtrain)
-#' param <- list(max.depth = 2, eta = 1, silent = 1)
 #' logregobj <- function(preds, dtrain) {
 #'    labels <- getinfo(dtrain, "label")
 #'    preds <- 1/(1 + exp(-preds))
@@ -116,13 +117,13 @@
 #'    err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
 #'    return(list(metric = "error", value = err))
 #' }
-#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror)
+#' param <- list(max.depth = 2, eta = 1, silent = 1, objective = logregobj, eval_metric = evalerror)
+#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
 #' @export
-#'
 xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
-                      obj = NULL, feval = NULL, verbose = 1, printEveryN=1L,
-                      early_stop_round = NULL, early.stop.round = NULL,
-                      maximize = NULL, ...) {
+                      obj = NULL, feval = NULL, verbose = 1, print.every.n = 1L,
+                      early.stop.round = NULL, maximize = NULL,
+                      save_period = 0, save_name = "xgboost.model", ...) {
   dtrain <- data
   if (typeof(params) != "list") {
     stop("xgb.train: first argument params must be list")
@@ -137,14 +138,34 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
   }
   if (length(watchlist) != 0 && verbose == 0) {
     warning('watchlist is provided but verbose=0, no evaluation information will be printed')
-    watchlist <- list()
   }
-  params = append(params, list(...))
+
+  fit.call <- match.call()
+  dot.params <- list(...)
+  nms.params <- names(params)
+  nms.dot.params <- names(dot.params)
+  if (length(intersect(nms.params, nms.dot.params)) > 0)
+    stop("Duplicated term in parameters. Please check your list of params.")
+  params <- append(params, dot.params)
+
+  # customized objective and evaluation metric interface
+  if (!is.null(params$objective) && !is.null(obj))
+    stop("xgb.train: cannot assign two different objectives")
+  if (!is.null(params$objective))
+    if (class(params$objective) == 'function') {
+      obj <- params$objective
+      params$objective <- NULL
+    }
+  if (!is.null(params$eval_metric) && !is.null(feval))
+    stop("xgb.train: cannot assign two different evaluation metrics")
+  if (!is.null(params$eval_metric))
+    if (class(params$eval_metric) == 'function') {
+      feval <- params$eval_metric
+      params$eval_metric <- NULL
+    }
 
   # Early stopping
-  if (is.null(early_stop_round) && !is.null(early.stop.round))
-    early_stop_round = early.stop.round
-  if (!is.null(early_stop_round)){
+  if (!is.null(early.stop.round)){
     if (!is.null(feval) && is.null(maximize))
       stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
     if (length(watchlist) == 0)
@@ -154,55 +175,63 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
     if (is.null(maximize))
     {
       if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
-        maximize = FALSE
+        maximize <- FALSE
       } else {
-        maximize = TRUE
+        maximize <- TRUE
      }
     }
 
     if (maximize) {
-      bestScore = 0
+      bestScore <- 0
     } else {
-      bestScore = Inf
+      bestScore <- Inf
     }
-    bestInd = 0
+    bestInd <- 0
     earlyStopflag = FALSE
 
-    if (length(watchlist)>1)
+    if (length(watchlist) > 1)
       warning('Only the first data set in watchlist is used for early stopping process.')
   }
 
   handle <- xgb.Booster(params, append(watchlist, dtrain))
   bst <- xgb.handleToBooster(handle)
-  printEveryN=max( as.integer(printEveryN), 1L)
+  print.every.n <- max(as.integer(print.every.n), 1L)
   for (i in 1:nrounds) {
     succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj)
     if (length(watchlist) != 0) {
       msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval)
-      if (0== ( (i-1) %% printEveryN))
-        cat(paste(msg, "\n", sep=""))
-      if (!is.null(early_stop_round))
+      if (0 == ((i - 1) %% print.every.n))
+        cat(paste(msg, "\n", sep = ""))
+      if (!is.null(early.stop.round))
       {
-        score = strsplit(msg,':|\\s+')[[1]][3]
-        score = as.numeric(score)
-        if ((maximize && score>bestScore) || (!maximize && score<bestScore)) {
-          bestScore = score
-          bestInd = i
+        score <- strsplit(msg, ':|\\s+')[[1]][3]
+        score <- as.numeric(score)
+        if ((maximize && score > bestScore) || (!maximize && score < bestScore)) {
+          bestScore <- score
+          bestInd <- i
         } else {
-          if (i-bestInd>=early_stop_round) {
-            earlyStopflag = TRUE
-            cat('Stopping. Best iteration:',bestInd)
+          earlyStopflag = TRUE
+          if (i - bestInd >= early.stop.round) {
+            cat('Stopping. Best iteration:', bestInd, '\n')
             break
           }
         }
       }
     }
+    if (save_period > 0) {
+      if (i %% save_period == 0) {
+        xgb.save(bst, save_name)
+      }
+    }
   }
   bst <- xgb.Booster.check(bst)
-  if (!is.null(early_stop_round)) {
-    bst$bestScore = bestScore
-    bst$bestInd = bestInd
+  if (!is.null(early.stop.round)) {
+    bst$bestScore <- bestScore
+    bst$bestInd <- bestInd
   }
+
+  attr(bst, "call") <- fit.call
+  attr(bst, "params") <- params
   return(bst)
 }
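A hedged sketch wiring the new interface together: the custom objective and metric now ride inside `params`, early stopping uses the single `early.stop.round` argument, and `save_period`/`save_name` checkpoint the booster. It assumes `dtrain`, `watchlist`, `logregobj` and `evalerror` as defined in the roxygen example above; the values are illustrative.

```r
param <- list(max.depth = 2, eta = 1, silent = 1,
              objective = logregobj, eval_metric = evalerror)
bst <- xgb.train(params = param, data = dtrain, nrounds = 10,
                 watchlist = watchlist,
                 early.stop.round = 3,
                 maximize = FALSE,  # error-style metric: lower is better
                 save_period = 5, save_name = "xgboost.model")
bst$bestScore  # populated when early.stop.round is set
```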
@@ -28,15 +28,17 @@
 #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
 #'        information of performance. If 2, xgboost will print information of both
 #'        performance and construction progress information
-#' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
+#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
 #' @param missing Missing is only used when input is dense matrix, pick a float
 #' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.
-#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered.
+#' @param weight a vector indicating the weight for each row of the input.
+#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
 #' If set to an integer \code{k}, training with a validation set will stop if the performance
 #' keeps getting worse consecutively for \code{k} rounds.
-#' @param early.stop.round An alternative of \code{early_stop_round}.
-#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
+#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 #' \code{maximize=TRUE} means the larger the evaluation score the better.
+#' @param save_period save the model to the disk in every \code{save_period} rounds, 0 means no such action.
+#' @param save_name the name or path for periodically saved model file.
 #' @param ... other parameters to pass to \code{params}.
 #'
 #' @details
@@ -56,15 +58,11 @@
 #' pred <- predict(bst, test$data)
 #'
 #' @export
-#'
-xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds,
-                    verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL,
-                    maximize = NULL, ...) {
-  if (is.null(missing)) {
-    dtrain <- xgb.get.DMatrix(data, label)
-  } else {
-    dtrain <- xgb.get.DMatrix(data, label, missing)
-  }
+xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
+                    params = list(), nrounds,
+                    verbose = 1, print.every.n = 1L, early.stop.round = NULL,
+                    maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) {
+  dtrain <- xgb.get.DMatrix(data, label, missing, weight)
 
   params <- append(params, list(...))
 
@@ -74,14 +72,12 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(),
     watchlist <- list()
   }
 
-  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, printEveryN=printEveryN,
-                   early_stop_round = early_stop_round,
-                   early.stop.round = early.stop.round)
+  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n = print.every.n,
+                   early.stop.round = early.stop.round, maximize = maximize,
+                   save_period = save_period, save_name = save_name)
 
   return(bst)
 }
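A hedged usage sketch for the updated wrapper; per-row `weight`, the `maximize` pass-through and the save arguments are what this hunk adds. The uniform weight vector below is purely illustrative.

```r
data(agaricus.train, package = 'xgboost')
w <- rep(1, length(agaricus.train$label))  # illustrative per-row weights
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               weight = w, missing = NA,
               max.depth = 2, eta = 1, nthread = 2, nround = 4,
               objective = "binary:logistic", print.every.n = 2)
```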
 #' Training part from Mushroom Data Set
 #'
 #' This data set is originally from the Mushroom data set,
@@ -1,20 +1,44 @@
-# R package for xgboost.
-
-## Installation
-
-For up-to-date version (which is recommended), please install from github. Windows users will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
-
-```r
-devtools::install_github('dmlc/xgboost',subdir='R-package')
-```
-
-For stable version on CRAN, please run
-
-```r
-install.packages('xgboost')
-```
-
-## Examples
+R package for xgboost
+=====================
+
+[](http://cran.r-project.org/web/packages/xgboost)
+[](http://cran.rstudio.com/web/packages/xgboost/index.html)
+
+Installation
+------------
+
+We are [on CRAN](https://cran.r-project.org/web/packages/xgboost/index.html) now. For the stable/pre-compiled (for Windows and OS X) version, please install from CRAN:
+
+```r
+install.packages('xgboost')
+```
+
+For the up-to-date version, please install from github. Windows users will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
+
+```r
+devtools::install_github('dmlc/xgboost',subdir='R-package')
+```
+
+Examples
+--------
 
 * Please visit [walk through example](demo).
 * See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.R) on this dataset and the one related to [Otto challenge](../demo/kaggle-otto), including a [RMarkdown documentation](../demo/kaggle-otto/understandingXGBoostModel.Rmd).
+
+Notes
+-----
+
+If you face an issue installing the package using ```devtools::install_github```, something like this (even after updating libxml and RCurl as a lot of forums say) -
+
+```
+devtools::install_github('dmlc/xgboost',subdir='R-package')
+Downloading github repo dmlc/xgboost@master
+Error in function (type, msg, asError = TRUE)  :
+  Peer certificate cannot be authenticated with given CA certificates
+```
+
+To get around this you can build the package locally as mentioned [here](https://github.com/dmlc/xgboost/issues/347) -
+
+```
+1. Clone the current repository and set your workspace to xgboost/R-package/
+2. Run R CMD INSTALL --build . in terminal to get the tarball.
+3. Run install.packages('path_to_the_tarball',repo=NULL) in R to install.
+```
@@ -1,4 +1,5 @@
 basic_walkthrough       Basic feature walkthrough
+caret_wrapper           Use xgboost to train in caret library
 custom_objective        Customize loss function, and evaluation metric
 boost_from_prediction   Boosting from existing prediction
 predict_first_ntree     Predicting using first n trees
@@ -1,6 +1,7 @@
 XGBoost R Feature Walkthrough
 ====
 * [Basic walkthrough of wrappers](basic_walkthrough.R)
+* [Train an xgboost model from caret library](caret_wrapper.R)
 * [Customize loss function, and evaluation metric](custom_objective.R)
 * [Boosting from existing prediction](boost_from_prediction.R)
 * [Predicting using first n trees](predict_first_ntree.R)
@@ -1,7 +1,7 @@
 require(xgboost)
 require(methods)
 # we load in the agaricus dataset
-# In this example, we are aiming to predict whether a mushroom can be eated
+# In this example, we are aiming to predict whether a mushroom can be eaten
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
@@ -12,30 +12,30 @@ class(train$data)
 
 #-------------Basic Training using XGBoost-----------------
 # this is the basic usage of xgboost, you can put a matrix in the data field
-# note: we are puting in sparse matrix here, xgboost naturally handles sparse input
-# use sparse matrix when your feature is sparse(e.g. when you using one-hot encoding vector)
-print("training xgboost with sparseMatrix")
+# note: we are putting in sparse matrix here, xgboost naturally handles sparse input
+# use sparse matrix when your feature is sparse (e.g. when you are using one-hot encoding vector)
+print("Training xgboost with sparseMatrix")
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic")
 # alternatively, you can put in dense matrix, i.e. basic R-matrix
-print("training xgboost with Matrix")
+print("Training xgboost with Matrix")
 bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic")
 
-# you can also put in xgb.DMatrix object, stores label, data and other meta datas needed for advanced features
-print("training xgboost with xgb.DMatrix")
+# you can also put in xgb.DMatrix object, which stores label, data and other meta data needed for advanced features
+print("Training xgboost with xgb.DMatrix")
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2,
                objective = "binary:logistic")
 
 # Verbose = 0,1,2
-print ('train xgboost with verbose 0, no message')
+print("Train xgboost with verbose 0, no message")
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic", verbose = 0)
-print ('train xgboost with verbose 1, print evaluation metric')
+print("Train xgboost with verbose 1, print evaluation metric")
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic", verbose = 1)
-print ('train xgboost with verbose 2, also print information about tree')
+print("Train xgboost with verbose 2, also print information about tree")
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic", verbose = 2)
@@ -72,15 +72,15 @@ print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
 dtrain <- xgb.DMatrix(data = train$data, label=train$label)
 dtest <- xgb.DMatrix(data = test$data, label=test$label)
 #---------------Using watchlist----------------
-# watchlist is a list of xgb.DMatrix, each of them tagged with name
+# watchlist is a list of xgb.DMatrix, each of them is tagged with a name
 watchlist <- list(train=dtrain, test=dtest)
 # to train with watchlist, use xgb.train, which contains more advanced features
 # watchlist allows us to monitor the evaluation result on all data in the list
-print ('train xgboost using xgb.train with watchlist')
+print("Train xgboost using xgb.train with watchlist")
 bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
                  nthread = 2, objective = "binary:logistic")
 # we can change evaluation metrics, or use multiple evaluation metrics
-print ('train xgboost using xgb.train with watchlist, watch logloss and error')
+print("Train xgboost using xgb.train with watchlist, watch logloss and error")
 bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
                  eval.metric = "error", eval.metric = "logloss",
                  nthread = 2, objective = "binary:logistic")
@@ -102,4 +102,9 @@ xgb.dump(bst, "dump.raw.txt", with.stats = T)
 
 # Finally, you can check which features are the most important.
 print("Most important features (look at column Gain):")
-print(xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt"))
+imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], model = bst)
+print(imp_matrix)
+
+# Feature importance bar plot by gain
+print("Feature importance Plot : ")
+print(xgb.plot.importance(importance_matrix = imp_matrix))
@@ -23,4 +23,4 @@ setinfo(dtrain, "base_margin", ptrain)
 setinfo(dtest, "base_margin", ptest)
 
 print('this is result of boost from initial prediction')
-bst <- xgb.train( param, dtrain, 1, watchlist )
+bst <- xgb.train(params = param, data = dtrain, nrounds = 1, watchlist = watchlist)
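For context, `base_margin` is what makes this demo boost from an existing prediction rather than from scratch. A hedged sketch of how the margins set above would typically be produced, assuming a previously trained booster `bst` and the demo's `dtrain`/`dtest`:

```r
# Margins are raw, pre-transformation scores from an existing model;
# outputmargin = TRUE is the usual way to obtain them.
ptrain <- predict(bst, dtrain, outputmargin = TRUE)
ptest  <- predict(bst, dtest, outputmargin = TRUE)
setinfo(dtrain, "base_margin", ptrain)
setinfo(dtest, "base_margin", ptest)
```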
R-package/demo/caret_wrapper.R (new file, 35 lines)
@@ -0,0 +1,35 @@
+# install development version of caret library that contains xgboost models
+devtools::install_github("topepo/caret/pkg/caret")
+require(caret)
+require(xgboost)
+require(data.table)
+require(vcd)
+require(e1071)
+
+# Load Arthritis dataset in memory.
+data(Arthritis)
+# Create a copy of the dataset with the data.table package (data.table is 100% compliant with R dataframes but its syntax is a lot more consistent and its performance is really good).
+df <- data.table(Arthritis, keep.rownames = F)
+
+# Let's add some new categorical features to see if it helps. Of course these features are highly correlated to the Age feature. Usually it's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
+# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treats them as independent values.
+df[,AgeDiscret:= as.factor(round(Age/10,0))]
+
+# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
+df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
+
+# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
+df[,ID:=NULL]
+
+#-------------Basic Training using XGBoost in caret Library-----------------
+# Set up control parameters for caret::train
+# Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters.
+fitControl <- trainControl(method = "cv", number = 10, repeats = 2, search = "random")
+# train an xgbTree model using caret::train
+model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl)
+
+# Instead of trees for our boosters, you can also fit a linear regression or logistic regression model using xgbLinear
+# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl)
+
+# See model results
+print(model)
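As a hedged follow-up to the demo above, the caret-fitted model plugs straight into caret's own predict interface; using the training frame here only keeps the sketch self-contained.

```r
# Predictions come back on the factor scale of Improved.
pred <- predict(model, newdata = df)
print(head(pred))
```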
@@ -1,11 +1,13 @@
 require(xgboost)
 require(Matrix)
 require(data.table)
-if (!require(vcd)) install.packages('vcd') #Available in Cran. Used for its dataset with categorical values.
+if (!require(vcd)) {
+  install.packages('vcd') #Available in Cran. Used for its dataset with categorical values.
+  require(vcd)
+}
 # According to its documentation, Xgboost works only on numbers.
 # Sometimes the dataset we have to work on has categorical data.
-# A categorical variable is one which have a fixed number of values. By exemple, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable.
+# A categorical variable is one which has a fixed number of values. For example, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable.
 #
 # In R, a categorical variable is called a Factor.
 # Type ?factor in console for more information.
@@ -65,18 +67,17 @@ output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
|
|||||||
cat("Learning...\n")
|
cat("Learning...\n")
|
||||||
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
|
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
|
||||||
eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
|
eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
|
||||||
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
|
|
||||||
|
|
||||||
# sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
|
# sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
|
||||||
importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
|
importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
|
||||||
print(importance)
|
print(importance)
|
||||||
# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
|
# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
|
||||||
|
|
||||||
# Does these results make sense?
|
# Does these result make sense?
|
||||||
# Let's check some Chi2 between each of these features and the outcome.
|
# Let's check some Chi2 between each of these features and the outcome.
|
||||||
|
|
||||||
print(chisq.test(df$Age, df$Y))
|
print(chisq.test(df$Age, df$Y))
|
||||||
# Pearson correlation between Age and illness disapearing is 35
|
# Pearson correlation between Age and illness disappearing is 35
|
||||||
|
|
||||||
print(chisq.test(df$AgeDiscret, df$Y))
|
print(chisq.test(df$AgeDiscret, df$Y))
|
||||||
# Our first simplification of Age gives a Pearson correlation of 8.
|
# Our first simplification of Age gives a Pearson correlation of 8.
|
||||||
@@ -84,6 +85,6 @@ print(chisq.test(df$AgeDiscret, df$Y))
|
|||||||
print(chisq.test(df$AgeCat, df$Y))
|
print(chisq.test(df$AgeCat, df$Y))
|
||||||
# The perfectly random split I did between young and old at 30 years old have a low correlation of 2. It's a result we may expect as may be in my mind > 30 years is being old (I am 32 and starting feeling old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same. Don't let your "gut" lower the quality of your model. In "data science", there is science :-)
|
# The perfectly random split I did between young and old at 30 years old have a low correlation of 2. It's a result we may expect as may be in my mind > 30 years is being old (I am 32 and starting feeling old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same. Don't let your "gut" lower the quality of your model. In "data science", there is science :-)
|
||||||
|
|
||||||
# As you can see, in general destroying information by simplying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not enough complex to show that. Check Kaggle forum for some challenging datasets.
|
# As you can see, in general destroying information by simplifying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not enough complex to show that. Check Kaggle forum for some challenging datasets.
|
||||||
# However it's almost always worse when you add some arbitrary rules.
|
# However it's almost always worse when you add some arbitrary rules.
|
||||||
# Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. Linear model may not be that strong in these scenario.
|
# Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. Linear model may not be that strong in these scenario.
|
||||||
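For orientation: the `sparse_matrix` and `output_vector` objects referenced in this hunk are built earlier in the demo. A sketch of the typical construction (the exact formula is an assumption; the `output_vector` line comes from the hunk header):

# one-hot encode the categorical columns of df into a sparse dgCMatrix
sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)
# binary outcome: 1 when the treatment effect is "Marked", 0 otherwise
output_vector = df[, Y := 0][Improved == "Marked", Y := 1][, Y]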
@@ -40,12 +40,12 @@ evalerror <- function(preds, dtrain) {
   return(list(metric = "error", value = err))
 }

-param <- list(max.depth = 2, eta = 1, silent = 1)
+param <- list(max.depth = 2, eta = 1, silent = 1,
+              objective = logregobj, eval_metric = evalerror)
 # train with customized objective
-xgb.cv(param, dtrain, nround, nfold = 5,
-       obj = logregobj, feval = evalerror)
+xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5)

 # do cross validation with prediction values for each fold
-res <- xgb.cv(param, dtrain, nround, nfold = 5, prediction = TRUE)
+res <- xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5, prediction = TRUE)
 res$dt
 length(res$pred)
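`logregobj` and `evalerror` are defined earlier in this demo, outside the hunk; for reference, a sketch of the definitions assumed here (the `evalerror` body matches the lines shown in a later hunk):

# customized objective: logistic loss, returning first and second order gradients
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))   # margin to probability
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
# customized metric: classification error computed on the margin scale
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(metric = "error", value = err))
}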
@@ -8,7 +8,6 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
 # you must know what you are doing
-param <- list(max.depth = 2, eta = 1, nthread = 2, silent = 1)
 watchlist <- list(eval = dtest, train = dtrain)
 num_round <- 2

@@ -33,10 +32,13 @@ evalerror <- function(preds, dtrain) {
   err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
   return(list(metric = "error", value = err))
 }

+param <- list(max.depth = 2, eta = 1, nthread = 2, silent = 1,
+              objective = logregobj, eval_metric = evalerror)
 print('start training with user customized objective')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
+bst <- xgb.train(param, dtrain, num_round, watchlist)

 #
 # there can be cases where you want additional information

@@ -55,8 +57,9 @@ logregobjattr <- function(preds, dtrain) {
   hess <- preds * (1 - preds)
   return(list(grad = grad, hess = hess))
 }
+param <- list(max.depth = 2, eta = 1, nthread = 2, silent = 1,
+              objective = logregobjattr, eval_metric = evalerror)
 print('start training with user customized objective, with additional attributes in DMatrix')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist, logregobjattr, evalerror)
+bst <- xgb.train(param, dtrain, num_round, watchlist)
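The "additional attributes in DMatrix" variant reads its labels from an attribute instead of calling `getinfo` inside the objective; a sketch of how that presumably fits together (the attribute assignment is an assumption, the gradient and hessian lines appear in the hunk above):

# stash the labels on the DMatrix so the objective can read them back cheaply
attr(dtrain, 'label') <- getinfo(dtrain, 'label')
logregobjattr <- function(preds, dtrain) {
  labels <- attr(dtrain, 'label')  # read from the attribute instead of getinfo()
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}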
@@ -31,9 +31,10 @@ evalerror <- function(preds, dtrain) {
   return(list(metric = "error", value = err))
 }
 print('start training with early stopping setting')
-# training with customized objective, we can also do step by step training
-# simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror, maximize = FALSE,
+bst <- xgb.train(param, dtrain, num_round, watchlist,
+                 objective = logregobj, eval_metric = evalerror, maximize = FALSE,
                  early.stop.round = 3)
-bst <- xgb.cv(param, dtrain, num_round, nfold = 5, obj = logregobj, feval = evalerror,
+bst <- xgb.cv(param, dtrain, num_round, nfold = 5,
+              objective = logregobj, eval_metric = evalerror,
               maximize = FALSE, early.stop.round = 3)
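A short note on these settings: `maximize = FALSE` because `evalerror` returns an error rate (lower is better), and `early.stop.round = 3` stops training once the validation performance fails to improve for 3 consecutive rounds. A sketch with an explicit validation set (objects assumed from the demo):

watchlist <- list(train = dtrain, eval = dtest)  # validation performance drives the stop
bst <- xgb.train(param, dtrain, num_round, watchlist,
                 objective = logregobj, eval_metric = evalerror,
                 maximize = FALSE, early.stop.round = 3)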
@@ -1,21 +1,52 @@
 require(xgboost)
+require(data.table)
+require(Matrix)
+
+set.seed(1982)
+
 # load in the agaricus dataset
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)

-param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
-watchlist <- list(eval = dtest, train = dtrain)
-nround = 5
+param <- list(max.depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
+nround = 4

 # train the model
-bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist)
-cat('start testing prediction from first n trees\n')
-### predict using first 2 tree
-pred_with_leaf = predict(bst, dtest, ntreelimit = 2, predleaf = TRUE)
-head(pred_with_leaf)
+bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
+
+# Model accuracy without new features
+accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) /
+                   length(agaricus.test$label)
+
 # by default, we predict using all the trees
 pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
 head(pred_with_leaf)
+
+create.new.tree.features <- function(model, original.features){
+  pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
+  cols <- list()
+  for(i in 1:ncol(pred_with_leaf)){  # one column of leaf indices per tree
+    # the sorted unique leaf ids are enough for the purpose of adding features
+    leaf.id <- sort(unique(pred_with_leaf[,i]))
+    cols[[i]] <- factor(x = pred_with_leaf[,i], levels = leaf.id)
+  }
+  # cBind (Matrix package) keeps the result sparse
+  cBind(original.features, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
+}
+
+# Convert previous features to one hot encoding
+new.features.train <- create.new.tree.features(bst, agaricus.train$data)
+new.features.test <- create.new.tree.features(bst, agaricus.test$data)
+
+# learning with new features
+new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
+new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
+watchlist <- list(train = new.dtrain)
+bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
+
+# Model accuracy with new features
+accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) /
+                  length(agaricus.test$label)
+
+# Here the accuracy was already good and is now perfect.
+cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now",
+          accuracy.after, "!\n"))
@@ -9,3 +9,4 @@ demo(create_sparse_matrix)
 demo(predict_leaf_indices)
 demo(early_stopping)
 demo(poisson_regression)
+demo(caret_wrapper)
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgboost.R
 \docType{data}
 \name{agaricus.test}

@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgboost.R
 \docType{data}
 \name{agaricus.train}
15  R-package/man/edge.parser.Rd  Normal file
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{edge.parser}
+\alias{edge.parser}
+\title{Parse the graph to extract vector of edges}
+\usage{
+edge.parser(element)
+}
+\arguments{
+\item{element}{igraph object containing the path from the root to the leaf.}
+}
+\description{
+Parse the graph to extract vector of edges
+}

15  R-package/man/get.paths.to.leaf.Rd  Normal file
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{get.paths.to.leaf}
+\alias{get.paths.to.leaf}
+\title{Extract path from root to leaf from data.table}
+\usage{
+get.paths.to.leaf(dt.tree)
+}
+\arguments{
+\item{dt.tree}{data.table containing the nodes and edges of the trees}
+}
+\description{
+Extract path from root to leaf from data.table
+}
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/getinfo.xgb.DMatrix.R
 \docType{methods}
 \name{getinfo}

15  R-package/man/multiplot.Rd  Normal file
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{multiplot}
+\alias{multiplot}
+\title{Plot multiple graphs at the same time}
+\usage{
+multiplot(..., cols = 1)
+}
+\arguments{
+\item{cols}{number of columns}
+}
+\description{
+Plot multiple graphs aligned by rows and columns.
+}
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/nrow.xgb.DMatrix.R
 \docType{methods}
 \name{nrow,xgb.DMatrix-method}

@@ -18,5 +18,6 @@ data(agaricus.train, package='xgboost')
 train <- agaricus.train
 dtrain <- xgb.DMatrix(train$data, label=train$label)
 stopifnot(nrow(dtrain) == nrow(train$data))

 }
+

@@ -1,11 +1,11 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/predict.xgb.Booster.R
 \docType{methods}
 \name{predict,xgb.Booster-method}
 \alias{predict,xgb.Booster-method}
 \title{Predict method for eXtreme Gradient Boosting model}
 \usage{
-\S4method{predict}{xgb.Booster}(object, newdata, missing = NULL,
+\S4method{predict}{xgb.Booster}(object, newdata, missing = NA,
   outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE)
 }
 \arguments{

@@ -31,6 +31,16 @@ than 0. It will use all trees by default.}
 \description{
 Predicted values based on xgboost model object.
 }
+\details{
+The purpose of the \code{ntreelimit} option is to let the user train a model with lots
+of trees but use only the first trees for prediction, to avoid overfitting
+(without having to train a new model with fewer trees).
+
+The \code{predleaf} option is inspired by §3.1 of the paper
+\code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
+The idea is to use the model as a generator of new features which capture non-linear links
+between the original features.
+}
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
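As a quick illustration of the two options documented above (assuming the `bst` model and `dtest` matrix from the examples):

# use only the first 2 trees for prediction, to guard against overfitting
pred2 <- predict(bst, dtest, ntreelimit = 2)
# return the predicted leaf index per tree instead of probabilities
leaves <- predict(bst, dtest, predleaf = TRUE)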
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/predict.xgb.Booster.handle.R
 \docType{methods}
 \name{predict,xgb.Booster.handle-method}

@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/setinfo.xgb.DMatrix.R
 \docType{methods}
 \name{setinfo}

@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/slice.xgb.DMatrix.R
 \docType{methods}
 \name{slice}
@@ -1,10 +1,10 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.DMatrix.R
 \name{xgb.DMatrix}
 \alias{xgb.DMatrix}
 \title{Construct xgb.DMatrix object}
 \usage{
-xgb.DMatrix(data, info = list(), missing = 0, ...)
+xgb.DMatrix(data, info = list(), missing = NA, ...)
 }
 \arguments{
 \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character
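A small sketch of the new default in practice (dataset taken from the package examples):

data(agaricus.train, package = 'xgboost')
# NA, rather than 0, now marks missing values by default
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label, missing = NA)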
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.DMatrix.save.R
 \name{xgb.DMatrix.save}
 \alias{xgb.DMatrix.save}
88  R-package/man/xgb.create.features.Rd  Normal file
@@ -0,0 +1,88 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.create.features.R
+\name{xgb.create.features}
+\alias{xgb.create.features}
+\title{Create new features from a previously learned model}
+\usage{
+xgb.create.features(model, training.data)
+}
+\arguments{
+\item{model}{decision tree boosting model learned on the original data}
+
+\item{training.data}{original data (usually provided as a \code{dgCMatrix} matrix)}
+}
+\value{
+\code{dgCMatrix} matrix including both the original data and the new features.
+}
+\description{
+May improve the learning by adding new features to the training data based on the decision trees from a previously learned model.
+}
+\details{
+This function is inspired by paragraph 3.1 of the paper:
+
+\strong{Practical Lessons from Predicting Clicks on Ads at Facebook}
+
+\emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
+Joaquin Quiñonero Candela)}
+
+International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
+
+\url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
+
+Extract explaining the method:
+
+"\emph{We found that boosted decision trees are a powerful and very
+convenient way to implement non-linear and tuple transformations
+of the kind we just described. We treat each individual
+tree as a categorical feature that takes as value the
+index of the leaf an instance ends up falling in. We use
+1-of-K coding of this type of features.
+
+For example, consider the boosted tree model in Figure 1 with 2 subtrees,
+where the first subtree has 3 leafs and the second 2 leafs. If an
+instance ends up in leaf 2 in the first subtree and leaf 1 in
+second subtree, the overall input to the linear classifier will
+be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
+correspond to the leaves of the first subtree and last 2 to
+those of the second subtree.
+
+[...]
+
+We can understand boosted decision tree
+based transformation as a supervised feature encoding that
+converts a real-valued vector into a compact binary-valued
+vector. A traversal from root node to a leaf node represents
+a rule on certain features.}"
+}
+\examples{
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
+
+param <- list(max.depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
+nround = 4
+
+bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
+
+# Model accuracy without new features
+accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+
+# Convert previous features to one hot encoding
+new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
+new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
+
+# learning with new features
+new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
+new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
+watchlist <- list(train = new.dtrain)
+bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
+
+# Model accuracy with new features
+accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+
+# Here the accuracy was already good and is now perfect.
+cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\\n"))
+}
@@ -1,14 +1,13 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.cv.R
 \name{xgb.cv}
 \alias{xgb.cv}
 \title{Cross Validation}
 \usage{
-xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
-  missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
-  obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
-  verbose = T, early_stop_round = NULL, early.stop.round = NULL,
-  maximize = NULL, ...)
+xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
+  prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL,
+  feval = NULL, stratified = TRUE, folds = NULL, verbose = T,
+  print.every.n = 1L, early.stop.round = NULL, maximize = NULL, ...)
 }
 \arguments{
 \item{params}{the list of parameters. Commonly used ones are:

@@ -41,7 +40,7 @@ value that represents missing value. Sometime a data use 0 or other extreme valu
 \item{showsd}{\code{boolean}, whether to show the standard deviation of cross validation}

-\item{metrics,}{list of evaluation metrics to be used in corss validation,
+\item{metrics, }{list of evaluation metrics to be used in cross validation,
 when it is not specified, the evaluation metric is chosen according to objective function.
 Possible options are:
 \itemize{

@@ -66,14 +65,14 @@ If folds are supplied, the nfold and stratified parameters would be ignored.}
 \item{verbose}{\code{boolean}, print the statistics during the process}

-\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered.
+\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
+
+\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
 If set to an integer \code{k}, training with a validation set will stop if the performance
 keeps getting worse consecutively for \code{k} rounds.}

-\item{early.stop.round}{An alternative of \code{early_stop_round}.}
-
-\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
+\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 \code{maximize=TRUE} means the larger the evaluation score the better.}

 \item{...}{other parameters to pass to \code{params}.}
 }
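A sketch of the updated signature in use (assuming a `dtrain` DMatrix as in the package demos):

param <- list(max.depth = 2, eta = 1, objective = "binary:logistic")
res <- xgb.cv(params = param, data = dtrain, nrounds = 10, nfold = 5,
              print.every.n = 2,  # print one progress message out of two
              early.stop.round = 3, maximize = FALSE)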
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.dump.R
 \name{xgb.dump}
 \alias{xgb.dump}

@@ -19,9 +19,9 @@ See demo/ for walkthrough example in R, and
 for example Format.}

 \item{with.stats}{whether to dump the statistics of the splits.
 When this option is on, the model dump comes with two additional statistics:
 the gain is the approximate loss function gain we get in each split;
 the cover is the sum of the second order gradient in each node.}
 }
 \value{
 if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
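As the value section notes, omitting `fname` returns the dump as a `character` vector; a sketch (assuming a trained booster `bst`):

dump <- xgb.dump(bst, with.stats = TRUE)  # gain and cover reported for each split
head(dump)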
@@ -1,18 +1,16 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.importance.R
 \name{xgb.importance}
 \alias{xgb.importance}
 \title{Show importance of features in a model}
 \usage{
-xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL,
-  data = NULL, label = NULL, target = function(x) ((x + label) == 2))
+xgb.importance(feature_names = NULL, model = NULL, data = NULL,
+  label = NULL, target = function(x) ((x + label) == 2))
 }
 \arguments{
-\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
+\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.}

-\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
-
-\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
+\item{model}{generated by the \code{xgb.train} function.}

 \item{data}{the dataset used for the training step. Will be used with the \code{label} parameter for co-occurence computation. More information in the \code{Detail} part. This parameter is optional.}

@@ -24,23 +22,24 @@ xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL,
 A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree models) in the model.
 }
 \description{
-Read a xgboost model text dump.
-Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
+Create a \code{data.table} of the most important features of a model.
 }
 \details{
-This is the function to understand the model trained (and through your model, your data).
-
-Results are returned for both linear and tree models.
+This function works for both linear and tree models.

 \code{data.table} is returned by the function.
-There are 3 columns :
+The columns are:
 \itemize{
-\item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
-\item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
-\item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
-\item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
+\item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
+\item \code{Gain} contribution of each feature to the model. For boosted tree models, the gain of each feature in each tree is taken into account, then averaged per feature to give a vision of the entire model. The highest percentage means the most important feature to predict the \code{label} used for the training (only available for tree models);
+\item \code{Cover} metric of the number of observations related to this feature (only available for tree models);
+\item \code{Weight} percentage representing the relative number of times a feature has been used in trees.
 }
+
+If you don't provide \code{feature_names}, the index of the features will be used instead.
+
+Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R).

 Co-occurence count
 ------------------

@@ -53,18 +52,14 @@ If you need to remember one thing only: until you want to leave us early, don't
 \examples{
 data(agaricus.train, package='xgboost')

-# Both dataset are list with two items, a sparse matrix and labels
-# (labels = outcome column which will be learned).
-# Each column of the sparse Matrix is a feature in one hot encoding format.
-train <- agaricus.train
-
-bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")

-# train$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.importance(train$data@Dimnames[[2]], model = bst)
+# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
+xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst)

 # Same thing with co-occurence computation this time
-xgb.importance(train$data@Dimnames[[2]], model = bst, data = train$data, label = train$label)
+xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst, data = agaricus.train$data, label = agaricus.train$label)
 }
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.load.R
 \name{xgb.load}
 \alias{xgb.load}
@@ -1,33 +1,33 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.model.dt.tree.R
 \name{xgb.model.dt.tree}
 \alias{xgb.model.dt.tree}
-\title{Convert tree model dump to data.table}
+\title{Parse boosted tree model text dump}
 \usage{
-xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
-  model = NULL, text = NULL, n_first_tree = NULL)
+xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
+  n_first_tree = NULL)
 }
 \arguments{
-\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
+\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).}

-\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
-
-\item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
+\item{model}{object created by the \code{xgb.train} function.}

-\item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
+\item{text}{\code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).}

-\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
+\item{n_first_tree}{limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending on the size of the model.}
 }
 \value{
-A \code{data.table} of the features used in the model with their gain, cover and few other thing.
+A \code{data.table} of the features used in the model with their gain, cover and a few other pieces of information.
 }
 \description{
-Read a tree model text dump and return a data.table.
+Parse a boosted tree model text dump and return a \code{data.table}.
 }
 \details{
-General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it.
+General function to convert a text dump of a tree model to a \code{data.table}.

-The content of the \code{data.table} is organised that way:
+The purpose is to help the user explore the model and get a better understanding of it.
+
+The columns of the \code{data.table} are:

 \itemize{
 \item \code{ID}: unique identifier of a node ;

@@ -39,21 +39,17 @@ The content of the \code{data.table} is organised that way:
 \item \code{Quality}: the gain related to the split in this specific node ;
 \item \code{Cover}: metric to measure the number of observations affected by the split ;
 \item \code{Tree}: ID of the tree. It is included in the main ID ;
-\item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ;
+\item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in the \code{Yes} or \code{No} column ;
 }
 }
 \examples{
 data(agaricus.train, package='xgboost')

-#Both dataset are list with two items, a sparse matrix and labels
-#(labels = outcome column which will be learned).
-#Each column of the sparse Matrix is a feature in one hot encoding format.
-train <- agaricus.train
-
-bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")

-#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
+# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
+xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)
 }
46  R-package/man/xgb.plot.deepness.Rd  Normal file
@@ -0,0 +1,46 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{xgb.plot.deepness}
+\alias{xgb.plot.deepness}
+\title{Plot model trees deepness}
+\usage{
+xgb.plot.deepness(model = NULL)
+}
+\arguments{
+\item{model}{dump generated by the \code{xgb.train} function.}
+}
+\value{
+Two graphs showing the distribution of the model deepness.
+}
+\description{
+Generate a graph to plot the distribution of deepness among trees.
+}
+\details{
+Display both the number of \code{leaf} and the distribution of \code{weighted observations}
+by tree deepness level.
+
+The purpose of this function is to help the user find the best trade-off between
+the \code{max.depth} and \code{min_child_weight} parameters, according to the bias / variance trade-off.
+
+See \link{xgb.train} for more information about these parameters.
+
+The graph is made of two parts:
+
+\itemize{
+\item Count: number of leaves per level of deepness;
+\item Weighted cover: normalized weighted cover per leaf (weighted number of instances).
+}
+
+This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
+}
+\examples{
+data(agaricus.train, package='xgboost')
+
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
+               eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
+               min_child_weight = 50)
+
+xgb.plot.deepness(model = bst)
+}
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.plot.importance.R
 \name{xgb.plot.importance}
 \alias{xgb.plot.importance}

@@ -15,11 +15,11 @@ xgb.plot.importance(importance_matrix = NULL, numberOfClusters = c(1:10))
 A \code{ggplot2} bar graph representing each feature by a horizontal bar. The longer the bar, the more important the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
 }
 \description{
-Read a data.table containing feature importance details and plot it.
+Read a data.table containing feature importance details and plot it (for both GLM and Trees).
 }
 \details{
 The purpose of this function is to easily represent the importance of each feature of a model.
-The function return a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
+The function returns a ggplot graph, therefore each of its characteristics can be overridden (to customize it).
 In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
 }
 \examples{

@@ -28,13 +28,13 @@ data(agaricus.train, package='xgboost')
 #Both dataset are list with two items, a sparse matrix and labels
 #(labels = outcome column which will be learned).
 #Each column of the sparse Matrix is a feature in one hot encoding format.
-train <- agaricus.train
-
-bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")

-#train$data@Dimnames[[2]] represents the column names of the sparse matrix.
-importance_matrix <- xgb.importance(train$data@Dimnames[[2]], model = bst)
+#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
+importance_matrix <- xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst)
 xgb.plot.importance(importance_matrix)
 }
58  R-package/man/xgb.plot.multi.trees.Rd  Normal file
@@ -0,0 +1,58 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.multi.trees.R
+\name{xgb.plot.multi.trees}
+\alias{xgb.plot.multi.trees}
+\title{Project all trees on one tree and plot it}
+\usage{
+xgb.plot.multi.trees(model, feature_names = NULL, features.keep = 5,
+  plot.width = NULL, plot.height = NULL)
+}
+\arguments{
+\item{model}{dump generated by the \code{xgb.train} function.}
+
+\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.}
+
+\item{features.keep}{number of features to keep in each position of the multi trees.}
+
+\item{plot.width}{width in pixels of the graph to produce}
+
+\item{plot.height}{height in pixels of the graph to produce}
+}
+\value{
+Two graphs showing the distribution of the model deepness.
+}
+\description{
+Visualization of the ensemble of trees as a single collective unit.
+}
+\details{
+This function tries to capture the complexity of a gradient boosted tree ensemble
+in a cohesive way.
+
+The goal is to improve the interpretability of a model generally seen as a black box.
+The function is dedicated to boosting applied to decision trees only.
+
+The purpose is to move from an ensemble of trees to a single tree only.
+
+It takes advantage of the fact that the shape of a binary tree is only defined by
+its deepness (therefore, in a boosting model, all trees have the same shape).
+
+Moreover, the trees tend to reuse the same features.
+
+The function will project each tree on one, and keep for each position the
+\code{features.keep} first features (based on the Gain per feature measure).
+
+This function is inspired by this blog post:
+\url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
+}
+\examples{
+data(agaricus.train, package='xgboost')
+
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
+               eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
+               min_child_weight = 50)
+
+p <- xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]], features.keep = 3)
+print(p)
+}
@@ -1,58 +1,48 @@
|
|||||||
% Generated by roxygen2 (4.1.1): do not edit by hand
|
% Generated by roxygen2: do not edit by hand
|
||||||
% Please edit documentation in R/xgb.plot.tree.R
|
% Please edit documentation in R/xgb.plot.tree.R
|
||||||
\name{xgb.plot.tree}
|
\name{xgb.plot.tree}
|
||||||
\alias{xgb.plot.tree}
|
\alias{xgb.plot.tree}
|
||||||
\title{Plot a boosted tree model}
|
\title{Plot a boosted tree model}
|
||||||
\usage{
|
\usage{
|
||||||
xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
|
xgb.plot.tree(feature_names = NULL, model = NULL, n_first_tree = NULL,
|
||||||
n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL)
|
plot.width = NULL, plot.height = NULL)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
|
 \item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
 
-\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).}
 
 \item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
 
 \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
 
-\item{CSSstyle}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
+\item{plot.width}{the width of the diagram in pixels.}
 
-\item{width}{the width of the diagram in pixels.}
+\item{plot.height}{the height of the diagram in pixels.}
 
-\item{height}{the height of the diagram in pixels.}
 }
 \value{
 A \code{DiagrammeR} of the model.
 }
 \description{
-Read a tree model text dump.
-Plotting only works for boosted tree model (not linear model).
+Read a tree model text dump and plot the model.
 }
 \details{
 The content of each node is organised that way:
 
 \itemize{
-\item \code{feature} value ;
-\item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ;
+\item \code{feature} value;
+\item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be;
 \item \code{gain}: metric the importance of the node in the model.
 }
 
-Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
-It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
+The function uses \href{http://www.graphviz.org/}{GraphViz} library for that purpose.
 }
 \examples{
 data(agaricus.train, package='xgboost')
 
-#Both dataset are list with two items, a sparse matrix and labels
-#(labels = outcome column which will be learned).
-#Each column of the sparse Matrix is a feature in one hot encoding format.
-train <- agaricus.train
-
-bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
 eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 
-#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
+# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
+xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)
 
 }
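The per-node `gain` and `cover` statistics described in \details come from a model dump with statistics enabled; a minimal sketch of inspecting them directly, assuming the `bst` model trained in the example above:

```r
# A minimal sketch, assuming the `bst` model from the example above.
# xgb.dump() with with.stats = TRUE emits the gain and cover statistics
# that xgb.plot.tree() renders inside each node.
dump_with_stats <- xgb.dump(bst, with.stats = TRUE)
head(dump_with_stats)
```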

@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.save.R
 \name{xgb.save}
 \alias{xgb.save}

@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.save.raw.R
 \name{xgb.save.raw}
 \alias{xgb.save.raw}

@@ -1,12 +1,13 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.train.R
 \name{xgb.train}
 \alias{xgb.train}
 \title{eXtreme Gradient Boosting Training}
 \usage{
 xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
-feval = NULL, verbose = 1, printEveryN=1L, early_stop_round = NULL,
-early.stop.round = NULL, maximize = NULL, ...)
+feval = NULL, verbose = 1, print.every.n = 1L,
+early.stop.round = NULL, maximize = NULL, save_period = 0,
+save_name = "xgboost.model", ...)
 }
 \arguments{
 \item{params}{the list of parameters.
@@ -26,7 +27,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
 \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
 \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
 \item \code{max_depth} maximum depth of a tree. Default: 6
-\item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
+\item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
 \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
 \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
 \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
@@ -43,19 +44,19 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
 3. Task Parameters
 
 \itemize{
-\item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
+\item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
 \itemize{
 \item \code{reg:linear} linear regression (Default).
 \item \code{reg:logistic} logistic regression.
 \item \code{binary:logistic} logistic regression for binary classification. Output probability.
 \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
 \item \code{num_class} set the number of classes. To use only with multiclass objectives.
-\item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{tonum_class}.
+\item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class}.
 \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
 \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
 }
 \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
-\item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
+\item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
 }}
 
 \item{data}{takes an \code{xgb.DMatrix} as the input.}
@@ -63,10 +64,10 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
 \item{nrounds}{the max number of iterations}
 
 \item{watchlist}{what information should be printed when \code{verbose=1} or
 \code{verbose=2}. Watchlist is used to specify validation set monitoring
 during training. For example user can specify
 watchlist=list(validation1=mat1, validation2=mat2) to watch
 the performance of each round's model on mat1 and mat2}
 
 \item{obj}{customized objective function. Returns gradient and second order
 gradient with given prediction and dtrain,}
@@ -78,17 +79,19 @@ prediction and dtrain,}
 \item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
 information of performance. If 2, xgboost will print information of both}
 
-\item{printEveryN}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
+\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
 
-\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered.
+\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
 If set to an integer \code{k}, training with a validation set will stop if the performance
 keeps getting worse consecutively for \code{k} rounds.}
 
-\item{early.stop.round}{An alternative of \code{early_stop_round}.}
+\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 
-\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
 \code{maximize=TRUE} means the larger the evaluation score the better.}
 
+\item{save_period}{save the model to the disk in every \code{save_period} rounds, 0 means no such action.}
+
+\item{save_name}{the name or path for periodically saved model file.}
+
 \item{...}{other parameters to pass to \code{params}.}
 }
 \description{
@@ -107,6 +110,7 @@ Number of threads can also be manually specified via \code{nthread} parameter.
 \itemize{
 \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
 \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
+\item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss}
 \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
 \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
 \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
@@ -122,7 +126,6 @@ data(agaricus.train, package='xgboost')
 dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 dtest <- dtrain
 watchlist <- list(eval = dtest, train = dtrain)
-param <- list(max.depth = 2, eta = 1, silent = 1)
 logregobj <- function(preds, dtrain) {
 labels <- getinfo(dtrain, "label")
 preds <- 1/(1 + exp(-preds))
@@ -135,6 +138,7 @@ evalerror <- function(preds, dtrain) {
 err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
 return(list(metric = "error", value = err))
 }
-bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror)
+param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror)
+bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
 }
 
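A minimal sketch tying the renamed and newly added arguments above together in one call; the values are illustrative only, with `dtrain` and `watchlist` as built in the example:

```r
# Illustrative values only; dtrain and watchlist as defined in the example above.
bst <- xgb.train(params = list(max.depth = 2, eta = 1, objective = "binary:logistic"),
                 data = dtrain, nrounds = 50, watchlist = watchlist,
                 print.every.n = 10,     # log every 10th evaluation message
                 early.stop.round = 3,   # stop after 3 rounds without improvement
                 maximize = FALSE,       # the monitored metric is minimized
                 save_period = 10,       # checkpoint the model every 10 rounds
                 save_name = "xgboost.model")
```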

@@ -1,12 +1,13 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgboost.R
 \name{xgboost}
 \alias{xgboost}
 \title{eXtreme Gradient Boosting (Tree) library}
 \usage{
-xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
-nrounds, verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL,
-maximize = NULL, ...)
+xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
+params = list(), nrounds, verbose = 1, print.every.n = 1L,
+early.stop.round = NULL, maximize = NULL, save_period = 0,
+save_name = "xgboost.model", ...)
 }
 \arguments{
 \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
@@ -18,6 +19,8 @@ if data is local data file or \code{xgb.DMatrix}.}
 \item{missing}{Missing is only used when input is dense matrix, pick a float
 value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.}
 
+\item{weight}{a vector indicating the weight for each row of the input.}
+
 \item{params}{the list of parameters.
 
 Commonly used ones are:
@@ -42,17 +45,19 @@ Commonly used ones are:
 information of performance. If 2, xgboost will print information of both
 performance and construction progress information}
 
-\item{printEveryN}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
+\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
 
-\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered.
+\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
 If set to an integer \code{k}, training with a validation set will stop if the performance
 keeps getting worse consecutively for \code{k} rounds.}
 
-\item{early.stop.round}{An alternative of \code{early_stop_round}.}
+\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 
-\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
 \code{maximize=TRUE} means the larger the evaluation score the better.}
 
+\item{save_period}{save the model to the disk in every \code{save_period} rounds, 0 means no such action.}
+
+\item{save_name}{the name or path for periodically saved model file.}
+
 \item{...}{other parameters to pass to \code{params}.}
 }
 \description{
@@ -73,5 +78,6 @@ test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
 eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
 pred <- predict(bst, test$data)
 
 }
 
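A minimal sketch of the two usage changes above, `missing = NA` and the new `weight` argument; the uniform weights are illustrative only:

```r
# Illustrative sketch; train$data and train$label as in the example above.
# missing = NA marks the placeholder for absent values in a dense matrix;
# weight assigns a per-row weight (here all equal, the neutral case).
bst <- xgboost(data = train$data, label = train$label,
               missing = NA, weight = rep(1, length(train$label)),
               max.depth = 2, eta = 1, nthread = 2, nrounds = 2,
               objective = "binary:logistic")
```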

@@ -1,3 +1,4 @@
+// Copyright (c) 2014 by Contributors
 #include <vector>
 #include <string>
 #include <utility>
@@ -31,14 +32,14 @@ extern "C" {
 bool CheckNAN(double v) {
 return ISNAN(v);
 }
-bool LogGamma(double v) {
+double LogGamma(double v) {
 return lgammafn(v);
 }
 } // namespace utils
 
 namespace random {
 void Seed(unsigned seed) {
-warning("parameter seed is ignored, please set random seed using set.seed");
+// warning("parameter seed is ignored, please set random seed using set.seed");
 }
 double Uniform(void) {
 return unif_rand();
@@ -58,6 +59,10 @@ inline void _WrapperEnd(void) {
 PutRNGstate();
 }
 
+// do nothing, check error
+inline void CheckErr(int ret) {
+}
+
 extern "C" {
 SEXP XGCheckNullPtr_R(SEXP handle) {
 return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
@@ -69,7 +74,8 @@ extern "C" {
 }
 SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
 _WrapperBegin();
-void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent));
+DMatrixHandle handle;
+CheckErr(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle));
 _WrapperEnd();
 SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
 R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
@@ -90,7 +96,8 @@ extern "C" {
 data[i * ncol +j] = din[i + nrow * j];
 }
 }
-void *handle = XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing));
+DMatrixHandle handle;
+CheckErr(XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing), &handle));
 _WrapperEnd();
 SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
 R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
@@ -118,8 +125,10 @@ extern "C" {
 indices_[i] = static_cast<unsigned>(p_indices[i]);
 data_[i] = static_cast<float>(p_data[i]);
 }
-void *handle = XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
-BeginPtr(data_), nindptr, ndata);
+DMatrixHandle handle;
+CheckErr(XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
+BeginPtr(data_), nindptr, ndata,
+&handle));
 _WrapperEnd();
 SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
 R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
@@ -133,7 +142,10 @@ extern "C" {
 for (int i = 0; i < len; ++i) {
 idxvec[i] = INTEGER(idxset)[i] - 1;
 }
-void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len);
+DMatrixHandle res;
+CheckErr(XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle),
+BeginPtr(idxvec), len,
+&res));
 _WrapperEnd();
 SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));
 R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
@@ -142,8 +154,8 @@ extern "C" {
 }
 void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
 _WrapperBegin();
-XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
-CHAR(asChar(fname)), asInteger(silent));
+CheckErr(XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
+CHAR(asChar(fname)), asInteger(silent)));
 _WrapperEnd();
 }
 void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
@@ -156,24 +168,27 @@ extern "C" {
 for (int i = 0; i < len; ++i) {
 vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
 }
-XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len);
+CheckErr(XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len));
 } else {
 std::vector<float> vec(len);
 #pragma omp parallel for schedule(static)
 for (int i = 0; i < len; ++i) {
 vec[i] = REAL(array)[i];
 }
-XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
+CheckErr(XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
 CHAR(asChar(field)),
-BeginPtr(vec), len);
+BeginPtr(vec), len));
 }
 _WrapperEnd();
 }
 SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
 _WrapperBegin();
 bst_ulong olen;
-const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
-CHAR(asChar(field)), &olen);
+const float *res;
+CheckErr(XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
+CHAR(asChar(field)),
+&olen,
+&res));
 _WrapperEnd();
 SEXP ret = PROTECT(allocVector(REALSXP, olen));
 for (size_t i = 0; i < olen; ++i) {
@@ -183,23 +198,25 @@ extern "C" {
 return ret;
 }
 SEXP XGDMatrixNumRow_R(SEXP handle) {
-bst_ulong nrow = XGDMatrixNumRow(R_ExternalPtrAddr(handle));
+bst_ulong nrow;
+CheckErr(XGDMatrixNumRow(R_ExternalPtrAddr(handle), &nrow));
 return ScalarInteger(static_cast<int>(nrow));
 }
 // functions related to booster
 void _BoosterFinalizer(SEXP ext) {
 if (R_ExternalPtrAddr(ext) == NULL) return;
-XGBoosterFree(R_ExternalPtrAddr(ext));
+CheckErr(XGBoosterFree(R_ExternalPtrAddr(ext)));
 R_ClearExternalPtr(ext);
 }
 SEXP XGBoosterCreate_R(SEXP dmats) {
 _WrapperBegin();
 int len = length(dmats);
 std::vector<void*> dvec;
-for (int i = 0; i < len; ++i){
+for (int i = 0; i < len; ++i) {
 dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
 }
-void *handle = XGBoosterCreate(BeginPtr(dvec), dvec.size());
+BoosterHandle handle;
+CheckErr(XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle));
 _WrapperEnd();
 SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
 R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
@@ -208,16 +225,16 @@ extern "C" {
 }
 void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
 _WrapperBegin();
-XGBoosterSetParam(R_ExternalPtrAddr(handle),
+CheckErr(XGBoosterSetParam(R_ExternalPtrAddr(handle),
 CHAR(asChar(name)),
-CHAR(asChar(val)));
+CHAR(asChar(val))));
 _WrapperEnd();
 }
 void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
 _WrapperBegin();
-XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
+CheckErr(XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
 asInteger(iter),
-R_ExternalPtrAddr(dtrain));
+R_ExternalPtrAddr(dtrain)));
 _WrapperEnd();
 }
 void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
@@ -230,9 +247,10 @@ extern "C" {
 tgrad[j] = REAL(grad)[j];
 thess[j] = REAL(hess)[j];
 }
-XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
+CheckErr(XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
 R_ExternalPtrAddr(dtrain),
-BeginPtr(tgrad), BeginPtr(thess), len);
+BeginPtr(tgrad), BeginPtr(thess),
+len));
 _WrapperEnd();
 }
 SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
@@ -249,21 +267,24 @@ extern "C" {
 for (int i = 0; i < len; ++i) {
 vec_sptr.push_back(vec_names[i].c_str());
 }
-const char *ret =
-XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
+const char *ret;
+CheckErr(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
 asInteger(iter),
-BeginPtr(vec_dmats), BeginPtr(vec_sptr), len);
+BeginPtr(vec_dmats),
+BeginPtr(vec_sptr),
+len, &ret));
 _WrapperEnd();
 return mkString(ret);
 }
 SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) {
 _WrapperBegin();
 bst_ulong olen;
-const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
-R_ExternalPtrAddr(dmat),
-asInteger(option_mask),
-asInteger(ntree_limit),
-&olen);
+const float *res;
+CheckErr(XGBoosterPredict(R_ExternalPtrAddr(handle),
+R_ExternalPtrAddr(dmat),
+asInteger(option_mask),
+asInteger(ntree_limit),
+&olen, &res));
 _WrapperEnd();
 SEXP ret = PROTECT(allocVector(REALSXP, olen));
 for (size_t i = 0; i < olen; ++i) {
@@ -274,12 +295,12 @@ extern "C" {
 }
 void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
 _WrapperBegin();
-XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
+CheckErr(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
 _WrapperEnd();
 }
 void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
 _WrapperBegin();
-XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
+CheckErr(XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
 _WrapperEnd();
 }
 void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) {
@@ -292,7 +313,8 @@ extern "C" {
 SEXP XGBoosterModelToRaw_R(SEXP handle) {
 bst_ulong olen;
 _WrapperBegin();
-const char *raw = XGBoosterGetModelRaw(R_ExternalPtrAddr(handle), &olen);
+const char *raw;
+CheckErr(XGBoosterGetModelRaw(R_ExternalPtrAddr(handle), &olen, &raw));
 _WrapperEnd();
 SEXP ret = PROTECT(allocVector(RAWSXP, olen));
 if (olen != 0) {
@@ -304,16 +326,16 @@ extern "C" {
 SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
 _WrapperBegin();
 bst_ulong olen;
-const char **res =
-XGBoosterDumpModel(R_ExternalPtrAddr(handle),
+const char **res;
+CheckErr(XGBoosterDumpModel(R_ExternalPtrAddr(handle),
 CHAR(asChar(fmap)),
 asInteger(with_stats),
-&olen);
+&olen, &res));
 _WrapperEnd();
 SEXP out = PROTECT(allocVector(STRSXP, olen));
 for (size_t i = 0; i < olen; ++i) {
 stringstream stream;
-stream << "booster["<<i<<"]\n" << res[i];
+stream << "booster[" << i <<"]\n" << res[i];
 SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
 }
 UNPROTECT(1);

@@ -1,10 +1,12 @@
-#ifndef XGBOOST_WRAPPER_R_H_
-#define XGBOOST_WRAPPER_R_H_
 /*!
+ * Copyright 2014 (c) by Contributors
 * \file xgboost_wrapper_R.h
 * \author Tianqi Chen
 * \brief R wrapper of xgboost
 */
+#ifndef XGBOOST_WRAPPER_R_H_  // NOLINT(*)
+#define XGBOOST_WRAPPER_R_H_  // NOLINT(*)
 
 extern "C" {
 #include <Rinternals.h>
 #include <R_ext/Random.h>
@@ -153,4 +155,4 @@ extern "C" {
 */
 SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
 }
-#endif  // XGBOOST_WRAPPER_R_H_
+#endif  // XGBOOST_WRAPPER_R_H_  // NOLINT(*)

@@ -1,3 +1,4 @@
+// Copyright (c) 2014 by Contributors
 #include <stdio.h>
 #include <stdarg.h>
 #include <Rinternals.h>

R-package/tests/testthat.R (new file)
@@ -0,0 +1,4 @@
+library(testthat)
+library(xgboost)
+
+test_check("xgboost")
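The harness above is what `R CMD check` executes; a sketch of running the same suite interactively, assuming the working directory is the repository root with the R-package/ sources checked out:

```r
# Run the new test suite from an R session (local checkout assumed).
library(testthat)
library(xgboost)
test_dir("R-package/tests/testthat")
```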
R-package/tests/testthat/test_basic.R (new file)
@@ -0,0 +1,36 @@
+require(xgboost)
+
+context("basic functions")
+
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+train <- agaricus.train
+test <- agaricus.test
+set.seed(1994)
+
+test_that("train and predict", {
+bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+pred <- predict(bst, test$data)
+expect_equal(length(pred), 1611)
+})
+
+test_that("early stopping", {
+res <- xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5,
+eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
+early.stop.round = 3, maximize = FALSE)
+expect_true(nrow(res) < 20)
+bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
+early.stop.round = 3, maximize = FALSE)
+pred <- predict(bst, test$data)
+expect_equal(length(pred), 1611)
+})
+
+test_that("save_period", {
+bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
+save_period = 10, save_name = "xgb.model")
+pred <- predict(bst, test$data)
+expect_equal(length(pred), 1611)
+})

R-package/tests/testthat/test_custom_objective.R (new file)
@@ -0,0 +1,48 @@
+context('Test models with custom objective')
+
+require(xgboost)
+
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+
+test_that("custom objective works", {
+
+watchlist <- list(eval = dtest, train = dtrain)
+num_round <- 2
+
+logregobj <- function(preds, dtrain) {
+labels <- getinfo(dtrain, "label")
+preds <- 1 / (1 + exp(-preds))
+grad <- preds - labels
+hess <- preds * (1 - preds)
+return(list(grad = grad, hess = hess))
+}
+evalerror <- function(preds, dtrain) {
+labels <- getinfo(dtrain, "label")
+err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
+return(list(metric = "error", value = err))
+}
+
+param <- list(max.depth=2, eta=1, nthread = 2, silent=1,
+objective=logregobj, eval_metric=evalerror)
+
+bst <- xgb.train(param, dtrain, num_round, watchlist)
+expect_equal(class(bst), "xgb.Booster")
+expect_equal(length(bst$raw), 1064)
+attr(dtrain, 'label') <- getinfo(dtrain, 'label')
+
+logregobjattr <- function(preds, dtrain) {
+labels <- attr(dtrain, 'label')
+preds <- 1 / (1 + exp(-preds))
+grad <- preds - labels
+hess <- preds * (1 - preds)
+return(list(grad = grad, hess = hess))
+}
+param <- list(max.depth=2, eta=1, nthread = 2, silent = 1,
+objective = logregobjattr, eval_metric = evalerror)
+bst <- xgb.train(param, dtrain, num_round, watchlist)
+expect_equal(class(bst), "xgb.Booster")
+expect_equal(length(bst$raw), 1064)
+})

R-package/tests/testthat/test_glm.R (new file)
@@ -0,0 +1,19 @@
+context('Test generalized linear models')
+
+require(xgboost)
+
+test_that("glm works", {
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+expect_equal(class(dtrain), "xgb.DMatrix")
+expect_equal(class(dtest), "xgb.DMatrix")
+param <- list(objective = "binary:logistic", booster = "gblinear",
+nthread = 2, alpha = 0.0001, lambda = 1)
+watchlist <- list(eval = dtest, train = dtrain)
+num_round <- 2
+bst <- xgb.train(param, dtrain, num_round, watchlist)
+ypred <- predict(bst, dtest)
+expect_equal(length(getinfo(dtest, 'label')), 1611)
+})

R-package/tests/testthat/test_helpers.R (new file)
@@ -0,0 +1,68 @@
+context('Test helper functions')
+
+require(xgboost)
+require(data.table)
+require(Matrix)
+require(vcd)
+
+set.seed(1982)
+data(Arthritis)
+data(agaricus.train, package='xgboost')
+df <- data.table(Arthritis, keep.rownames = F)
+df[,AgeDiscret := as.factor(round(Age / 10,0))]
+df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
+df[,ID := NULL]
+sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
+output_vector <- df[,Y := 0][Improved == "Marked",Y := 1][,Y]
+bst.Tree <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
+eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gbtree")
+
+bst.GLM <- xgboost(data = sparse_matrix, label = output_vector,
+eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gblinear")
+
+feature.names <- agaricus.train$data@Dimnames[[2]]
+
+test_that("xgb.dump works", {
+capture.output(print(xgb.dump(bst.Tree)))
+capture.output(print(xgb.dump(bst.GLM)))
+expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with.stats = T))
+})
+
+test_that("xgb.model.dt.tree works with and without feature names", {
+names.dt.trees <- c("ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover",
+"Tree", "Yes.Feature", "Yes.Cover", "Yes.Quality", "No.Feature", "No.Cover", "No.Quality")
+dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
+expect_equal(names.dt.trees, names(dt.tree))
+expect_equal(dim(dt.tree), c(162, 15))
+xgb.model.dt.tree(model = bst.Tree)
+})
+
+test_that("xgb.importance works with and without feature names", {
+importance.Tree <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst.Tree)
+expect_equal(dim(importance.Tree), c(7, 4))
+expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
+xgb.importance(model = bst.Tree)
+xgb.plot.importance(importance_matrix = importance.Tree)
+})
+
+test_that("xgb.importance works with GLM model", {
+importance.GLM <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst.GLM)
+expect_equal(dim(importance.GLM), c(10, 2))
+expect_equal(colnames(importance.GLM), c("Feature", "Weight"))
+xgb.importance(model = bst.GLM)
+xgb.plot.importance(importance.GLM)
+})
+
+test_that("xgb.plot.tree works with and without feature names", {
+xgb.plot.tree(feature_names = feature.names, model = bst.Tree)
+xgb.plot.tree(model = bst.Tree)
+})
+
+test_that("xgb.plot.multi.trees works with and without feature names", {
+xgb.plot.multi.trees(model = bst.Tree, feature_names = feature.names, features.keep = 3)
+xgb.plot.multi.trees(model = bst.Tree, features.keep = 3)
+})
+
+test_that("xgb.plot.deepness works", {
+xgb.plot.deepness(model = bst.Tree)
+})

R-package/tests/testthat/test_lint.R (new file)
@@ -0,0 +1,27 @@
+context("Code is of high quality and lint free")
+test_that("Code Lint", {
+skip_on_cran()
+skip_on_travis()
+skip_if_not_installed("lintr")
+my_linters <- list(
+absolute_paths_linter=lintr::absolute_paths_linter,
+assignment_linter=lintr::assignment_linter,
+closed_curly_linter=lintr::closed_curly_linter,
+commas_linter=lintr::commas_linter,
+# commented_code_linter=lintr::commented_code_linter,
+infix_spaces_linter=lintr::infix_spaces_linter,
+line_length_linter=lintr::line_length_linter,
+no_tab_linter=lintr::no_tab_linter,
+object_usage_linter=lintr::object_usage_linter,
+# snake_case_linter=lintr::snake_case_linter,
+# multiple_dots_linter=lintr::multiple_dots_linter,
+object_length_linter=lintr::object_length_linter,
+open_curly_linter=lintr::open_curly_linter,
+# single_quotes_linter=lintr::single_quotes_linter,
+spaces_inside_linter=lintr::spaces_inside_linter,
+spaces_left_parentheses_linter=lintr::spaces_left_parentheses_linter,
+trailing_blank_lines_linter=lintr::trailing_blank_lines_linter,
+trailing_whitespace_linter=lintr::trailing_whitespace_linter
+)
+# lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality
+})

R-package/tests/testthat/test_parameter_exposure.R (new file)
@@ -0,0 +1,32 @@
+context('Test model params and call are exposed to R')
+
+require(xgboost)
+
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+
+dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+
+bst <- xgboost(data = dtrain,
+max.depth = 2,
+eta = 1,
+nround = 10,
+nthread = 1,
+verbose = 0,
+objective = "binary:logistic")
+
+test_that("call is exposed to R", {
+model_call <- attr(bst, "call")
+expect_is(model_call, "call")
+})
+
+test_that("params is exposed to R", {
+model_params <- attr(bst, "params")
+
+expect_is(model_params, "list")
+
+expect_equal(model_params$eta, 1)
+expect_equal(model_params$max.depth, 2)
+expect_equal(model_params$objective, "binary:logistic")
+})

R-package/tests/testthat/test_poisson_regression.R (new file)
@@ -0,0 +1,14 @@
+context('Test poisson regression model')
+
+require(xgboost)
+set.seed(1994)
+
+test_that("poisson regression works", {
+data(mtcars)
+bst <- xgboost(data = as.matrix(mtcars[,-11]),label = mtcars[,11],
+objective = 'count:poisson', nrounds=5)
+expect_equal(class(bst), "xgb.Booster")
+pred <- predict(bst,as.matrix(mtcars[, -11]))
+expect_equal(length(pred), 32)
+expect_equal(sqrt(mean( (pred - mtcars[,11]) ^ 2)), 1.16, tolerance = 0.01)
+})

@@ -190,7 +190,7 @@ Measure feature importance
 In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature).
 
 ```{r}
-importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst)
+importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
 head(importance)
 ```
 
@@ -202,7 +202,7 @@ head(importance)
 
 `Cover` measures the relative quantity of observations concerned by a feature.
 
-`Frequence` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
+`Frequency` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
 
 ### Improvement in the interpretability of feature importance data.table
 
@@ -213,10 +213,10 @@ One simple solution is to count the co-occurrences of a feature and a class of t
 For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.
 
 ```{r}
-importanceRaw <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
+importanceRaw <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
 
 # Cleaning for better display
-importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequence=NULL)]
+importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
 
 head(importanceClean)
 ```

@@ -57,16 +57,14 @@ devtools::install_github('dmlc/xgboost', subdir='R-package')
 Cran version
 ------------
 
-For stable version on *CRAN*, run:
+As of 2015-03-13, ‘xgboost’ was removed from the CRAN repository.
 
-```{r installCran, eval=FALSE}
-install.packages('xgboost')
-```
+Formerly available versions can be obtained from the CRAN [archive](http://cran.r-project.org/src/contrib/Archive/xgboost)
 
 Learning
 ========
 
-For the purpose of this tutorial we will load **Xgboost** package.
+For the purpose of this tutorial we will load **XGBoost** package.
 
 ```{r libLoading, results='hold', message=F, warning=F}
 require(xgboost)
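A sketch of installing one of those archived releases from source; the version number below is an assumption and should be replaced with a tarball actually present in the archive:

```r
# Illustrative only: the exact version must exist under the CRAN archive URL.
install.packages(
  "http://cran.r-project.org/src/contrib/Archive/xgboost/xgboost_0.4-2.tar.gz",
  repos = NULL, type = "source")
```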
@@ -117,7 +115,7 @@ dim(train$data)
 dim(test$data)
 ```
 
-This dataset is very small to not make the **R** package too heavy, however **Xgboost** is built to manage huge dataset very efficiently.
+This dataset is very small to not make the **R** package too heavy, however **XGBoost** is built to manage huge dataset very efficiently.
 
 As seen below, the `data` are stored in a `dgCMatrix` which is a *sparse* matrix and `label` vector is a `numeric` vector (`{0,1}`):
 
@@ -126,7 +124,7 @@ class(train$data)[1]
 class(train$label)
 ```
 
-Basic Training using Xgboost
+Basic Training using XGBoost
 ----------------------------
 
 This step is the most critical part of the process for the quality of our model.
@@ -162,7 +160,7 @@ bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth
 
 #### xgb.DMatrix
 
-**Xgboost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be usefull for the most advanced features we will discover later.
+**XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later.
 
 ```{r trainingDmatrix, message=F, warning=F}
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
@@ -171,7 +169,7 @@ bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround
 
 #### Verbose option
 
-**Xgboost** has severa features to help you to view how the learning progress internally. The purpose is to help you to set the best parameters, which is the key of your model quality.
+**XGBoost** has several features to help you to view how the learning progress internally. The purpose is to help you to set the best parameters, which is the key of your model quality.
 
 One of the simplest way to see the training progress is to set the `verbose` option (see below for more advanced technics).
 
@@ -190,13 +188,13 @@ bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, o
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 2)
 ```
 
-Basic prediction using Xgboost
+Basic prediction using XGBoost
 ==============================
 
 Perform the prediction
 ----------------------
 
-The pupose of the model we have built is to classify new data. As explained before, we will use the `test` dataset for this step.
+The purpose of the model we have built is to classify new data. As explained before, we will use the `test` dataset for this step.
 
 ```{r predicting, message=F, warning=F}
 pred <- predict(bst, test$data)
@@ -213,7 +211,7 @@ These numbers doesn't look like *binary classification* `{0,1}`. We need to perf
 Transform the regression in a binary classification
 ---------------------------------------------------
 
-The only thing that **Xgboost** does is a *regression*. **Xgboost** is using `label` vector to build its *regression* model.
+The only thing that **XGBoost** does is a *regression*. **XGBoost** is using `label` vector to build its *regression* model.
 
 How can we use a *regression* model to perform a binary classification?
 
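The section's answer amounts to thresholding the regression output; a minimal sketch with the usual 0.5 cut-off (the cut-off value is an assumption):

```r
# Map regression scores in [0, 1] to classes with an assumed 0.5 cut-off.
prediction <- as.numeric(pred > 0.5)
head(prediction)
```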
@@ -269,9 +267,9 @@ Measure learning progress with xgb.train
|
|||||||
|
|
||||||
Both `xgboost` (simple) and `xgb.train` (advanced) functions train models.
|
Both `xgboost` (simple) and `xgb.train` (advanced) functions train models.
|
||||||
|
|
||||||
One of the special feature of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a time when having too many rounds lead to an overfitting. You can see this feature as a cousin of cross-validation method. The following technics will help you to avoid overfitting or optimizing the learning time in stopping it as soon as possible.
|
One of the special features of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a point where having too many rounds leads to overfitting. You can see this feature as a cousin of the cross-validation method. The following techniques will help you avoid overfitting and optimize the learning time by stopping it as soon as possible.
|
||||||
|
|
||||||
One way to measure progress in learning of a model is to provide to **Xgboost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
|
One way to measure progress in the learning of a model is to provide **XGBoost** with a second dataset that is already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
|
||||||
|
|
||||||
> In some way it is similar to what we did above with the average error. The main difference is that above we measured the error after building the model, whereas here we measure errors during its construction.
|
> In some way it is similar to what we did above with the average error. The main difference is that above we measured the error after building the model, whereas here we measure errors during its construction.
|
||||||
|
|
||||||
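For that we need a second `xgb.DMatrix`; a minimal sketch, assuming the agaricus `test` list loaded at the beginning of this vignette:

```r
# the test data is wrapped exactly like the training data
dtest <- xgb.DMatrix(data = test$data, label = test$label)
watchlist <- list(train = dtrain, test = dtest)
```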
@@ -283,11 +281,11 @@ watchlist <- list(train=dtrain, test=dtest)
|
|||||||
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
|
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
|
||||||
```
|
```
|
||||||
|
|
||||||
**Xgboost** has computed at each round the same average error metric than seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
|
**XGBoost** has computed at each round the same average error metric as seen above (we set `nround` to 2, which is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
|
||||||
|
|
||||||
Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
|
Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
|
||||||
|
|
||||||
If with your own dataset you have not such results, you should think about how you did to divide your dataset in training and test. May be there is something to fix. Again, `caret` package may [help](http://topepo.github.io/caret/splitting.html).
|
If you do not get such results with your own dataset, you should think about how you divided it into training and test sets. Maybe there is something to fix. Again, the `caret` package may [help](http://topepo.github.io/caret/splitting.html).
|
||||||
|
|
||||||
For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
|
For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
|
||||||
|
|
||||||
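For instance (a sketch reusing the `watchlist` above), `eval.metric` can be passed several times to track both the classification error and the log-loss:

```r
bst <- xgb.train(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2,
                 watchlist = watchlist, eval.metric = "error",
                 eval.metric = "logloss", objective = "binary:logistic")
```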
@@ -300,7 +298,7 @@ bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchli
|
|||||||
Linear boosting
|
Linear boosting
|
||||||
---------------
|
---------------
|
||||||
|
|
||||||
Until know, all the learnings we have performed were based on boosting trees. **Xgboost** implements a second algorithm, based on linear boosting. The only difference with previous command is `booster = "gblinear"` parameter (and removing `eta` parameter).
|
Until now, all the learning we have performed was based on boosted trees. **XGBoost** implements a second algorithm, based on linear boosting. The only difference from the previous command is the `booster = "gblinear"` parameter (and the removal of the `eta` parameter).
|
||||||
|
|
||||||
```{r linearBoosting, message=F, warning=F}
|
```{r linearBoosting, message=F, warning=F}
|
||||||
bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
|
bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
|
||||||
@@ -308,7 +306,7 @@ bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nr
|
|||||||
|
|
||||||
In this specific case, *linear boosting* gets slightly better performance metrics than the decision-tree-based algorithm.
|
In this specific case, *linear boosting* gets slightly better performance metrics than the decision-tree-based algorithm.
|
||||||
|
|
||||||
In simple cases, it will happem because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better to catch a non linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use.
|
In simple cases this will happen because there is nothing better than a linear algorithm to capture a linear link. However, decision trees are much better at capturing a non-linear link between predictors and the outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to get an idea of what to use.
|
||||||
|
|
||||||
Manipulating xgb.DMatrix
|
Manipulating xgb.DMatrix
|
||||||
------------------------
|
------------------------
|
||||||
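The diff below only shows the tail of this section; as a rough sketch of the kind of manipulation it covers (assuming the `xgb.DMatrix.save` and `getinfo` helpers from the R package), an `xgb.DMatrix` can be saved to disk and its `label` read back:

```r
# save the DMatrix to a binary file and load it back
xgb.DMatrix.save(dtrain, "dtrain.buffer")
dtrain2 <- xgb.DMatrix("dtrain.buffer")

# extract the label vector stored inside the DMatrix
label <- getinfo(dtrain2, "label")
print(head(label))
```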
@@ -339,6 +337,17 @@ err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
|
|||||||
print(paste("test-error=", err))
|
print(paste("test-error=", err))
|
||||||
```
|
```
|
||||||
|
|
||||||
|
View feature importance/influence from the learnt model
|
||||||
|
-------------------------------------------------------
|
||||||
|
|
||||||
|
Feature importance is similar to R gbm package's relative influence (rel.inf).
|
||||||
|
|
||||||
|
```
|
||||||
|
importance_matrix <- xgb.importance(model = bst)
|
||||||
|
print(importance_matrix)
|
||||||
|
xgb.plot.importance(importance_matrix = importance_matrix)
|
||||||
|
```
|
||||||
|
|
||||||
View the trees from a model
|
View the trees from a model
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
||||||
@@ -348,14 +357,20 @@ You can dump the tree you learned using `xgb.dump` into a text file.
|
|||||||
xgb.dump(bst, with.stats = T)
|
xgb.dump(bst, with.stats = T)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can plot the trees from your model using `xgb.plot.tree`.
|
||||||
|
|
||||||
|
```
|
||||||
|
xgb.plot.tree(model = bst)
|
||||||
|
```
|
||||||
|
|
||||||
> if you provide a path to `fname` parameter you can save the trees to your hard drive.
|
> if you provide a path to `fname` parameter you can save the trees to your hard drive.
|
||||||
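For instance (a sketch of the `fname` usage mentioned in the note above):

```r
# write the dump to disk instead of returning it as a character vector
xgb.dump(bst, fname = "dump.raw.txt", with.stats = TRUE)
```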
|
|
||||||
Save and load models
|
Save and load models
|
||||||
--------------------
|
--------------------
|
||||||
|
|
||||||
May be your dataset is big, and it takes time to train a model on it? May be you are not a big fan of loosing time in redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.
|
Maybe your dataset is big, and it takes time to train a model on it? Maybe you are not a big fan of losing time redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.
|
||||||
|
|
||||||
Hopefully for you, **Xgboost** implements such functions.
|
Fortunately for you, **XGBoost** implements such functions.
|
||||||
|
|
||||||
```{r saveModel, message=F, warning=F}
|
```{r saveModel, message=F, warning=F}
|
||||||
# save model to binary local file
|
# save model to binary local file
|
||||||
@@ -364,7 +379,7 @@ xgb.save(bst, "xgboost.model")
|
|||||||
|
|
||||||
> The `xgb.save` function should return `r TRUE` if everything goes well, and crash otherwise.
|
> The `xgb.save` function should return `r TRUE` if everything goes well, and crash otherwise.
|
||||||
|
|
||||||
An interesting test to see how identic is our saved model with the original one would be to compare the two predictions.
|
An interesting test to see how identical our saved model is to the original one would be to compare the two predictions.
|
||||||
|
|
||||||
```{r loadModel, message=F, warning=F}
|
```{r loadModel, message=F, warning=F}
|
||||||
# load binary model to R
|
# load binary model to R
|
||||||
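The body of this chunk is elided by the diff; a minimal sketch of the load-and-compare step (assuming `xgb.load` reads the file written by `xgb.save` above):

```r
# load the binary model back into R
bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, test$data)

# identical predictions mean nothing was lost in the round trip
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))
```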
@@ -382,7 +397,7 @@ file.remove("./xgboost.model")
|
|||||||
|
|
||||||
> result is `0`? We are good!
|
> result is `0`? We are good!
|
||||||
|
|
||||||
In some very specific cases, like when you want to pilot **Xgboost** from `caret` package, you will want to save the model as a *R* binary vector. See below how to do it.
|
In some very specific cases, like when you want to pilot **XGBoost** from the `caret` package, you will want to save the model as an *R* binary vector. See below how to do it.
|
||||||
|
|
||||||
```{r saveLoadRBinVectorModel, message=F, warning=F}
|
```{r saveLoadRBinVectorModel, message=F, warning=F}
|
||||||
# save model to R's raw vector
|
# save model to R's raw vector
|
||||||
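Again the diff elides the middle of the chunk; a sketch of the raw-vector round trip (assuming `xgb.save.raw` returns the model as a raw vector and `xgb.load` accepts it):

```r
# save model to R's raw vector
rawVec <- xgb.save.raw(bst)
# print the class of the saved object
print(class(rawVec))
# load the binary model back from the raw vector
bst3 <- xgb.load(rawVec)
```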
@@ -399,7 +414,7 @@ pred3 <- predict(bst3, test$data)
|
|||||||
print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
|
print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
|
||||||
```
|
```
|
||||||
|
|
||||||
> Again `0`? It seems that `Xgboost` works pretty well!
|
> Again `0`? It seems that `XGBoost` works pretty well!
|
||||||
|
|
||||||
References
|
References
|
||||||
==========
|
==========
|
||||||
|
|||||||
113
README.md
113
README.md
@@ -1,57 +1,84 @@
|
|||||||
XGBoost: eXtreme Gradient Boosting
|
<img src=https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/logo-m/xgboost.png width=135/> eXtreme Gradient Boosting
|
||||||
==================================
|
===========
|
||||||
|
[](https://travis-ci.org/dmlc/xgboost)
|
||||||
|
[](https://xgboost.readthedocs.org)
|
||||||
|
[](./LICENSE)
|
||||||
|
[](http://cran.r-project.org/web/packages/xgboost)
|
||||||
|
[](https://pypi.python.org/pypi/xgboost/)
|
||||||
|
[](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||||
|
|
||||||
An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
|
An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
|
||||||
It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also also distributed and scale to Terascale data
|
|
||||||
|
|
||||||
Contributors: https://github.com/dmlc/xgboost/graphs/contributors
|
It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). XGBoost can also be [distributed](#features) and scale to Terascale data.
|
||||||
|
|
||||||
Documentations: [Documentation of xgboost](doc/README.md)
|
XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) <img src=https://avatars2.githubusercontent.com/u/11508361?v=3&s=20> projects
|
||||||
|
|
||||||
Issues Tracker: [https://github.com/dmlc/xgboost/issues](https://github.com/dmlc/xgboost/issues?q=is%3Aissue+label%3Aquestion)
|
Contents
|
||||||
|
--------
|
||||||
Please join [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/) to ask questions and share your experience on xgboost.
|
* [What's New](#whats-new)
|
||||||
- Use issue tracker for bug reports, feature requests etc.
|
* [Version](#version)
|
||||||
- Use the user group to post your experience, ask questions about general usages.
|
* [Documentation](doc/index.md)
|
||||||
|
* [Build Instruction](doc/build.md)
|
||||||
Gitter for developers [](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
* [Features](#features)
|
||||||
|
* [Distributed XGBoost](multi-node)
|
||||||
Distributed Version: [Distributed XGBoost](multi-node)
|
* [Usecases](doc/index.md#highlight-links)
|
||||||
|
* [Bug Reporting](#bug-reporting)
|
||||||
Highlights of Usecases: [Highlight Links](doc/README.md#highlight-links)
|
* [Contributing to XGBoost](#contributing-to-xgboost)
|
||||||
|
* [Committers and Contributors](CONTRIBUTORS.md)
|
||||||
|
* [License](#license)
|
||||||
|
* [XGBoost in Graphlab Create](#xgboost-in-graphlab-create)
|
||||||
|
|
||||||
What's New
|
What's New
|
||||||
==========
|
----------
|
||||||
|
|
||||||
|
* XGBoost helps Vlad Mironov, Alexander Guschin to win the [CERN LHCb experiment Flavour of Physics competition](https://www.kaggle.com/c/flavours-of-physics). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/11/30/flavour-of-physics-technical-write-up-1st-place-go-polar-bears/).
|
||||||
|
* XGBoost helps Mario Filho, Josef Feigl, Lucas, Gilberto to win the [Caterpillar Tube Pricing competition](https://www.kaggle.com/c/caterpillar-tube-pricing). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/09/22/caterpillar-winners-interview-1st-place-gilberto-josef-leustagos-mario/).
|
||||||
|
* XGBoost helps Halla Yang to win the [Recruit Coupon Purchase Prediction Challenge](https://www.kaggle.com/c/coupon-purchase-prediction). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/10/21/recruit-coupon-purchase-winners-interview-2nd-place-halla-yang/).
|
||||||
|
* XGBoost helps Owen Zhang to win the [Avito Context Ad Click competition](https://www.kaggle.com/c/avito-context-ad-clicks). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/08/26/avito-winners-interview-1st-place-owen-zhang/).
|
||||||
|
* XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance)
|
||||||
|
Check out the [winning solution](https://github.com/ChenglongChen/Kaggle_CrowdFlower)
|
||||||
* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
|
* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
|
||||||
* XGBoost wins [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
|
* XGBoost helps three champion teams to win [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
|
||||||
- Checkout the winning solution at [Highlight links](doc/README.md#highlight-links)
|
Check out the [winning solution](doc/README.md#highlight-links)
|
||||||
* [External Memory Version](doc/external_memory.md)
|
* [External Memory Version](doc/external_memory.md)
|
||||||
|
|
||||||
Features
|
|
||||||
========
|
|
||||||
* Easily accessible in python, R, Julia, CLI
|
|
||||||
* Fast speed and memory efficient
|
|
||||||
- Can be more than 10 times faster than GBM in sklearn and R
|
|
||||||
- Handles sparse matrices, support external memory
|
|
||||||
* Accurate prediction, and used extensively by data scientists and kagglers
|
|
||||||
- See [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
|
|
||||||
* Distributed and Portable
|
|
||||||
- The distributed version runs on Hadoop (YARN), MPI, SGE etc.
|
|
||||||
- Scales to billions of examples and beyond
|
|
||||||
|
|
||||||
Build
|
|
||||||
=======
|
|
||||||
* Run ```bash build.sh``` (you can also type make)
|
|
||||||
- Normally it gives what you want
|
|
||||||
- See [Build Instruction](doc/build.md) for more information
|
|
||||||
|
|
||||||
Version
|
Version
|
||||||
=======
|
-------
|
||||||
* Current version xgboost-0.4, a lot improvment has been made since 0.3
|
|
||||||
- Change log in [CHANGES.md](CHANGES.md)
|
* Current version xgboost-0.4
|
||||||
|
- [Change log](CHANGES.md)
|
||||||
- This version is compatible with 0.3x versions
|
- This version is compatible with 0.3x versions
|
||||||
|
|
||||||
|
Features
|
||||||
|
--------
|
||||||
|
* Easily accessible through CLI, [python](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/basic_walkthrough.py),
|
||||||
|
[R](https://github.com/dmlc/xgboost/blob/master/R-package/demo/basic_walkthrough.R),
|
||||||
|
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
|
||||||
|
* It's fast! Benchmark numbers comparing xgboost, H2O, Spark, R - [benchm-ml numbers](https://github.com/szilard/benchm-ml)
|
||||||
|
* Memory efficient - Handles sparse matrices, supports external memory
|
||||||
|
* Accurate prediction, and used extensively by data scientists and kagglers - [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
|
||||||
|
* Distributed version runs on Hadoop (YARN), MPI, SGE etc., scales to billions of examples.
|
||||||
|
|
||||||
|
Bug Reporting
|
||||||
|
-------------
|
||||||
|
|
||||||
|
* For reporting bugs please use the [xgboost/issues](https://github.com/dmlc/xgboost/issues) page.
|
||||||
|
* For generic questions or to share your experience using xgboost please use the [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/)
|
||||||
|
|
||||||
|
|
||||||
|
Contributing to XGBoost
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
XGBoost has been developed and used by a group of active community members. Everyone is more than welcome to contribute. It is a way to make the project better and more accessible to more users.
|
||||||
|
* Check out [Feature Wish List](https://github.com/dmlc/xgboost/labels/Wish-List) to see what can be improved, or open an issue if you want something.
|
||||||
|
* Contribute to the [documents and examples](https://github.com/dmlc/xgboost/blob/master/doc/) to share your experience with other users.
|
||||||
|
* Please add your name to [CONTRIBUTORS.md](CONTRIBUTORS.md) after your patch has been merged.
|
||||||
|
|
||||||
|
License
|
||||||
|
-------
|
||||||
|
© Contributors, 2015. Licensed under an [Apache-2](https://github.com/dmlc/xgboost/blob/master/LICENSE) license.
|
||||||
|
|
||||||
XGBoost in Graphlab Create
|
XGBoost in Graphlab Create
|
||||||
==========================
|
--------------------------
|
||||||
* XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the Graphlab Create in http://graphlab.com/products/create/quick-start-guide.html
|
* XGBoost is adopted as part of the boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of terabyte-scale data in one framework. Try [Graphlab Create](http://graphlab.com/products/create/quick-start-guide.html).
|
||||||
* Nice blogpost by Jay Gu using GLC boosted tree to solve kaggle bike sharing challenge: http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand
|
* Nice [blogpost](http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand) by Jay Gu about using GLC boosted trees to solve the kaggle bike sharing challenge.
|
||||||
|
|||||||
36
appveyor.yml
Normal file
36
appveyor.yml
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
environment:
|
||||||
|
global:
|
||||||
|
CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\python-appveyor-demo\\appveyor\\run_with_env.cmd"
|
||||||
|
DISABLE_OPENMP: 1
|
||||||
|
VisualStudioVersion: 12.0
|
||||||
|
|
||||||
|
matrix:
|
||||||
|
- PYTHON: "C:\\Python27-x64"
|
||||||
|
PYTHON_VERSION: "2.7.x" # currently 2.7.9
|
||||||
|
PYTHON_ARCH: "64"
|
||||||
|
|
||||||
|
- PYTHON: "C:\\Python33-x64"
|
||||||
|
PYTHON_VERSION: "3.3.x" # currently 3.3.5
|
||||||
|
PYTHON_ARCH: "64"
|
||||||
|
|
||||||
|
platform:
|
||||||
|
- x64
|
||||||
|
|
||||||
|
configuration:
|
||||||
|
- Release
|
||||||
|
|
||||||
|
install:
|
||||||
|
- cmd: git clone https://github.com/ogrisel/python-appveyor-demo
|
||||||
|
- ECHO "Filesystem root:"
|
||||||
|
- ps: "ls \"C:/\""
|
||||||
|
|
||||||
|
- ECHO "Installed SDKs:"
|
||||||
|
- ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\""
|
||||||
|
|
||||||
|
- ps: python-appveyor-demo\appveyor\install.ps1
|
||||||
|
- "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
|
||||||
|
- "python --version"
|
||||||
|
- "python -c \"import struct; print(struct.calcsize('P') * 8)\""
|
||||||
|
|
||||||
|
build: off
|
||||||
|
#project: windows\xgboost.sln
|
||||||
12
build.sh
12
build.sh
@@ -6,6 +6,18 @@
|
|||||||
|
|
||||||
# See additional instruction in doc/build.md
|
# See additional instruction in doc/build.md
|
||||||
|
|
||||||
|
# for building a static OpenMP lib on Mac for easier installation on Mac
|
||||||
|
# doesn't work with Xcode clang/LLVM since Apple doesn't support OpenMP;
|
||||||
|
# needs gcc 4.9+ with OpenMP installed via brew. By default the static link is OFF
|
||||||
|
static_omp=0
|
||||||
|
if ((${static_omp}==1)); then
|
||||||
|
rm libgomp.a
|
||||||
|
ln -s `g++ -print-file-name=libgomp.a`
|
||||||
|
make clean
|
||||||
|
make omp_mac_static=1
|
||||||
|
echo "Successfully build multi-thread static link xgboost"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
if make; then
|
if make; then
|
||||||
echo "Successfully build multi-thread xgboost"
|
echo "Successfully build multi-thread xgboost"
|
||||||
|
|||||||
1
demo/.gitignore
vendored
1
demo/.gitignore
vendored
@@ -1 +1,2 @@
|
|||||||
*.libsvm
|
*.libsvm
|
||||||
|
*.pkl
|
||||||
|
|||||||
@@ -1,14 +1,14 @@
|
|||||||
XGBoost Examples
|
XGBoost Code Examples
|
||||||
====
|
=====================
|
||||||
This folder contains all the code examples using xgboost.
|
This folder contains all the code examples using xgboost.
|
||||||
|
|
||||||
* Contribution of examples, benchmarks is more than welcome!
|
* Contribution of examples, benchmarks is more than welcome!
|
||||||
* If you would like to share how you use xgboost to solve your problem, send a pull request :)
|
* If you would like to share how you use xgboost to solve your problem, send a pull request :)
|
||||||
|
|
||||||
Features Walkthrough
|
Features Walkthrough
|
||||||
====
|
--------------------
|
||||||
This is a list of short codes introducing different functionalities of xgboost and its wrapper.
|
This is a list of short code examples introducing the different functionalities of the xgboost packages.
|
||||||
* Basic walkthrough of wrappers
|
* Basic walkthrough of packages
|
||||||
[python](guide-python/basic_walkthrough.py)
|
[python](guide-python/basic_walkthrough.py)
|
||||||
[R](../R-package/demo/basic_walkthrough.R)
|
[R](../R-package/demo/basic_walkthrough.R)
|
||||||
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
|
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
|
||||||
@@ -22,8 +22,8 @@ This is a list of short codes introducing different functionalities of xgboost a
|
|||||||
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl)
|
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl)
|
||||||
* Predicting using first n trees
|
* Predicting using first n trees
|
||||||
[python](guide-python/predict_first_ntree.py)
|
[python](guide-python/predict_first_ntree.py)
|
||||||
[R](../R-package/demo/boost_from_prediction.R)
|
[R](../R-package/demo/predict_first_ntree.R)
|
||||||
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl)
|
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/predict_first_ntree.jl)
|
||||||
* Generalized Linear Model
|
* Generalized Linear Model
|
||||||
[python](guide-python/generalized_linear_model.py)
|
[python](guide-python/generalized_linear_model.py)
|
||||||
[R](../R-package/demo/generalized_linear_model.R)
|
[R](../R-package/demo/generalized_linear_model.R)
|
||||||
@@ -37,7 +37,7 @@ This is a list of short codes introducing different functionalities of xgboost a
|
|||||||
[R](../R-package/demo/predict_leaf_indices.R)
|
[R](../R-package/demo/predict_leaf_indices.R)
|
||||||
|
|
||||||
Basic Examples by Tasks
|
Basic Examples by Tasks
|
||||||
====
|
-----------------------
|
||||||
Most of the examples in this section are based on the CLI or python version.
|
Most of the examples in this section are based on the CLI or python version.
|
||||||
However, the parameter settings can be applied to all versions.
|
However, the parameter settings can be applied to all versions.
|
||||||
* [Binary classification](binary_classification)
|
* [Binary classification](binary_classification)
|
||||||
@@ -46,7 +46,6 @@ However, the parameter settings can be applied to all versions
|
|||||||
* [Learning to Rank](rank)
|
* [Learning to Rank](rank)
|
||||||
|
|
||||||
Benchmarks
|
Benchmarks
|
||||||
====
|
----------
|
||||||
* [Starter script for Kaggle Higgs Boson](kaggle-higgs)
|
* [Starter script for Kaggle Higgs Boson](kaggle-higgs)
|
||||||
* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution)
|
* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution)
|
||||||
|
|
||||||
|
|||||||
@@ -147,7 +147,7 @@ Run the command again, we can find the log file becomes
|
|||||||
```
|
```
|
||||||
The rule is eval[name-printed-in-log] = filename; the file will then be added to the monitoring process and evaluated each round.
|
The rule is eval[name-printed-in-log] = filename; the file will then be added to the monitoring process and evaluated each round.
|
||||||
|
|
||||||
xgboost also support monitoring multiple metrics, suppose we also want to monitor average log-likelihood of each prediction during training, simply add ```eval_metric=logloss``` to configure. Run again, we can find the log file becomes
|
xgboost also supports monitoring multiple metrics. Suppose we also want to monitor the average log-likelihood of each prediction during training; simply add ```eval_metric=logloss``` to the configuration. Run again, and we find the log file becomes
|
||||||
```
|
```
|
||||||
[0] test-error:0.016139 test-negllik:0.029795 trainname-error:0.014433 trainname-negllik:0.027023
|
[0] test-error:0.016139 test-negllik:0.029795 trainname-error:0.014433 trainname-negllik:0.027023
|
||||||
[1] test-error:0.000000 test-negllik:0.000000 trainname-error:0.001228 trainname-negllik:0.002457
|
[1] test-error:0.000000 test-negllik:0.000000 trainname-error:0.001228 trainname-negllik:0.002457
|
||||||
@@ -162,11 +162,15 @@ If you want to continue boosting from existing model, say 0002.model, use
|
|||||||
```
|
```
|
||||||
xgboost will load from 0002.model and continue boosting for 2 rounds, saving the output to continue.model. However, beware that the training and evaluation data specified in mushroom.conf should not change when you use this function.
|
xgboost will load from 0002.model and continue boosting for 2 rounds, saving the output to continue.model. However, beware that the training and evaluation data specified in mushroom.conf should not change when you use this function.
|
||||||
#### Use Multi-Threading
|
#### Use Multi-Threading
|
||||||
When you are working with a large dataset, you may want to take advantage of parallelism. If your compiler supports OpenMP, xgboost is naturally multi-threaded, to set number of parallel running threads to 10, add ```nthread=10``` to your configuration.
|
When you are working with a large dataset, you may want to take advantage of parallelism. If your compiler supports OpenMP, xgboost is naturally multi-threaded; to set the number of parallel threads, add the ```nthread``` parameter to your configuration.
|
||||||
|
E.g. ```nthread=10```
|
||||||
|
|
||||||
|
Set nthread to the number of physical cores of your CPU (on Unix, this can be found using ```lscpu```).
|
||||||
|
Some systems will have ```Thread(s) per core = 2```; for example, a 4-core CPU with 8 threads. In such a case set ```nthread=4```, not 8.
|
||||||
|
|
||||||
#### Additional Notes
|
#### Additional Notes
|
||||||
* What are ```agaricus.txt.test.buffer``` and ```agaricus.txt.train.buffer``` generated during runexp.sh?
|
* What are ```agaricus.txt.test.buffer``` and ```agaricus.txt.train.buffer``` generated during runexp.sh?
|
||||||
- By default xgboost will automatically generate a binary format buffer of input data, with suffix ```buffer```. When next time you run xgboost, it detects i
|
- By default xgboost will automatically generate a binary format buffer of the input data, with the suffix ```buffer```. The next time you run xgboost, it will detect and use these binary files.
|
||||||
Demonstrating how to use XGBoost accomplish binary classification tasks on UCI mushroom dataset http://archive.ics.uci.edu/ml/datasets/Mushroom
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
XGBoost Python Feature Walkthrough
|
XGBoost Python Feature Walkthrough
|
||||||
====
|
==================================
|
||||||
* [Basic walkthrough of wrappers](basic_walkthrough.py)
|
* [Basic walkthrough of wrappers](basic_walkthrough.py)
|
||||||
* [Customize loss function and evaluation metric](custom_objective.py)
|
* [Customize loss function and evaluation metric](custom_objective.py)
|
||||||
* [Boosting from existing prediction](boost_from_prediction.py)
|
* [Boosting from existing prediction](boost_from_prediction.py)
|
||||||
@@ -7,5 +7,8 @@ XGBoost Python Feature Walkthrough
|
|||||||
* [Generalized Linear Model](generalized_linear_model.py)
|
* [Generalized Linear Model](generalized_linear_model.py)
|
||||||
* [Cross validation](cross_validation.py)
|
* [Cross validation](cross_validation.py)
|
||||||
* [Predicting leaf indices](predict_leaf_indices.py)
|
* [Predicting leaf indices](predict_leaf_indices.py)
|
||||||
* [Sklearn Wrapper](sklearn_example.py)
|
* [Sklearn Wrapper](sklearn_examples.py)
|
||||||
|
* [Sklearn Parallel](sklearn_parallel.py)
|
||||||
|
* [Sklearn access evals result](sklearn_evals_result.py)
|
||||||
|
* [Access evals result](evals_result.py)
|
||||||
* [External Memory](external_memory.py)
|
* [External Memory](external_memory.py)
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import scipy.sparse
|
import scipy.sparse
|
||||||
|
import pickle
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
|
|
||||||
### simple example
|
### simple example
|
||||||
@@ -19,7 +20,7 @@ bst = xgb.train(param, dtrain, num_round, watchlist)
|
|||||||
# this is prediction
|
# this is prediction
|
||||||
preds = bst.predict(dtest)
|
preds = bst.predict(dtest)
|
||||||
labels = dtest.get_label()
|
labels = dtest.get_label()
|
||||||
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
|
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
|
||||||
bst.save_model('0001.model')
|
bst.save_model('0001.model')
|
||||||
# dump model
|
# dump model
|
||||||
bst.dump_model('dump.raw.txt')
|
bst.dump_model('dump.raw.txt')
|
||||||
@@ -28,6 +29,7 @@ bst.dump_model('dump.nice.txt','../data/featmap.txt')
|
|||||||
|
|
||||||
# save dmatrix into binary buffer
|
# save dmatrix into binary buffer
|
||||||
dtest.save_binary('dtest.buffer')
|
dtest.save_binary('dtest.buffer')
|
||||||
|
# save model
|
||||||
bst.save_model('xgb.model')
|
bst.save_model('xgb.model')
|
||||||
# load model and data in
|
# load model and data in
|
||||||
bst2 = xgb.Booster(model_file='xgb.model')
|
bst2 = xgb.Booster(model_file='xgb.model')
|
||||||
@@ -36,6 +38,14 @@ preds2 = bst2.predict(dtest2)
|
|||||||
# assert they are the same
|
# assert they are the same
|
||||||
assert np.sum(np.abs(preds2-preds)) == 0
|
assert np.sum(np.abs(preds2-preds)) == 0
|
||||||
|
|
||||||
|
# alternatively, you can pickle the booster
|
||||||
|
pks = pickle.dumps(bst2)
|
||||||
|
# load model and data in
|
||||||
|
bst3 = pickle.loads(pks)
|
||||||
|
preds3 = bst2.predict(dtest2)
|
||||||
|
# assert they are the same
|
||||||
|
assert np.sum(np.abs(preds3-preds)) == 0
|
||||||
|
|
||||||
###
|
###
|
||||||
# build dmatrix from scipy.sparse
|
# build dmatrix from scipy.sparse
|
||||||
print ('start running example of build DMatrix from scipy.sparse CSR Matrix')
|
print ('start running example of build DMatrix from scipy.sparse CSR Matrix')
|
||||||
@@ -44,22 +54,22 @@ row = []; col = []; dat = []
|
|||||||
i = 0
|
i = 0
|
||||||
for l in open('../data/agaricus.txt.train'):
|
for l in open('../data/agaricus.txt.train'):
|
||||||
arr = l.split()
|
arr = l.split()
|
||||||
labels.append( int(arr[0]))
|
labels.append(int(arr[0]))
|
||||||
for it in arr[1:]:
|
for it in arr[1:]:
|
||||||
k,v = it.split(':')
|
k,v = it.split(':')
|
||||||
row.append(i); col.append(int(k)); dat.append(float(v))
|
row.append(i); col.append(int(k)); dat.append(float(v))
|
||||||
i += 1
|
i += 1
|
||||||
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
|
csr = scipy.sparse.csr_matrix((dat, (row,col)))
|
||||||
dtrain = xgb.DMatrix( csr, label = labels )
|
dtrain = xgb.DMatrix(csr, label = labels)
|
||||||
watchlist = [(dtest,'eval'), (dtrain,'train')]
|
watchlist = [(dtest,'eval'), (dtrain,'train')]
|
||||||
bst = xgb.train( param, dtrain, num_round, watchlist )
|
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||||
|
|
||||||
print ('start running example of build DMatrix from scipy.sparse CSC Matrix')
|
print ('start running example of build DMatrix from scipy.sparse CSC Matrix')
|
||||||
# we can also construct from csc matrix
|
# we can also construct from csc matrix
|
||||||
csc = scipy.sparse.csc_matrix( (dat, (row,col)) )
|
csc = scipy.sparse.csc_matrix((dat, (row,col)))
|
||||||
dtrain = xgb.DMatrix(csc, label=labels)
|
dtrain = xgb.DMatrix(csc, label=labels)
|
||||||
watchlist = [(dtest,'eval'), (dtrain,'train')]
|
watchlist = [(dtest,'eval'), (dtrain,'train')]
|
||||||
bst = xgb.train( param, dtrain, num_round, watchlist )
|
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||||
|
|
||||||
print ('start running example of build DMatrix from numpy array')
|
print ('start running example of build DMatrix from numpy array')
|
||||||
# NOTE: npymat is a numpy array; we will convert it into scipy.sparse.csr_matrix in the internal implementation
|
# NOTE: npymat is a numpy array; we will convert it into scipy.sparse.csr_matrix in the internal implementation
|
||||||
@@ -67,6 +77,6 @@ print ('start running example of build DMatrix from numpy array')
|
|||||||
npymat = csr.todense()
|
npymat = csr.todense()
|
||||||
dtrain = xgb.DMatrix(npymat, label = labels)
|
dtrain = xgb.DMatrix(npymat, label = labels)
|
||||||
watchlist = [(dtest,'eval'), (dtrain,'train')]
|
watchlist = [(dtest,'eval'), (dtrain,'train')]
|
||||||
bst = xgb.train( param, dtrain, num_round, watchlist )
|
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
30
demo/guide-python/evals_result.py
Normal file
30
demo/guide-python/evals_result.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
##
|
||||||
|
# This script demonstrates how to access the eval metrics in xgboost
|
||||||
|
##
|
||||||
|
|
||||||
|
import xgboost as xgb
|
||||||
|
dtrain = xgb.DMatrix('../data/agaricus.txt.train', silent=True)
|
||||||
|
dtest = xgb.DMatrix('../data/agaricus.txt.test', silent=True)
|
||||||
|
|
||||||
|
param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error')]
|
||||||
|
|
||||||
|
num_round = 2
|
||||||
|
watchlist = [(dtest,'eval'), (dtrain,'train')]
|
||||||
|
|
||||||
|
evals_result = {}
|
||||||
|
bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result)
|
||||||
|
|
||||||
|
print('Access logloss metric directly from evals_result:')
|
||||||
|
print(evals_result['eval']['logloss'])
|
||||||
|
|
||||||
|
print('')
|
||||||
|
print('Access metrics through a loop:')
|
||||||
|
for e_name, e_mtrs in evals_result.items():
|
||||||
|
print('- {}'.format(e_name))
|
||||||
|
for e_mtr_name, e_mtr_vals in e_mtrs.items():
|
||||||
|
print(' - {}'.format(e_mtr_name))
|
||||||
|
print(' - {}'.format(e_mtr_vals))
|
||||||
|
|
||||||
|
print('')
|
||||||
|
print('Access complete dictionary:')
|
||||||
|
print(evals_result)
|
||||||
@@ -2,7 +2,11 @@
|
|||||||
python basic_walkthrough.py
|
python basic_walkthrough.py
|
||||||
python custom_objective.py
|
python custom_objective.py
|
||||||
python boost_from_prediction.py
|
python boost_from_prediction.py
|
||||||
|
python predict_first_ntree.py
|
||||||
python generalized_linear_model.py
|
python generalized_linear_model.py
|
||||||
python cross_validation.py
|
python cross_validation.py
|
||||||
python predict_leaf_indices.py
|
python predict_leaf_indices.py
|
||||||
|
python sklearn_examples.py
|
||||||
|
python sklearn_parallel.py
|
||||||
|
python external_memory.py
|
||||||
rm -rf *~ *.model *.buffer
|
rm -rf *~ *.model *.buffer
|
||||||
|
|||||||
43
demo/guide-python/sklearn_evals_result.py
Normal file
43
demo/guide-python/sklearn_evals_result.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
##
|
||||||
|
# This script demonstrates how to access the xgboost eval metrics by using sklearn
|
||||||
|
##
|
||||||
|
|
||||||
|
import xgboost as xgb
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.datasets import make_hastie_10_2
|
||||||
|
|
||||||
|
X, y = make_hastie_10_2(n_samples=2000, random_state=42)
|
||||||
|
|
||||||
|
# Map labels from {-1, 1} to {0, 1}
|
||||||
|
labels, y = np.unique(y, return_inverse=True)
|
||||||
|
|
||||||
|
X_train, X_test = X[:1600], X[1600:]
|
||||||
|
y_train, y_test = y[:1600], y[1600:]
|
||||||
|
|
||||||
|
param_dist = {'objective':'binary:logistic', 'n_estimators':2}
|
||||||
|
|
||||||
|
clf = xgb.XGBModel(**param_dist)
|
||||||
|
# Or you can use: clf = xgb.XGBClassifier(**param_dist)
|
||||||
|
|
||||||
|
clf.fit(X_train, y_train,
|
||||||
|
eval_set=[(X_train, y_train), (X_test, y_test)],
|
||||||
|
eval_metric='logloss',
|
||||||
|
verbose=True)
|
||||||
|
|
||||||
|
# Load evals result by calling the evals_result() function
|
||||||
|
evals_result = clf.evals_result()
|
||||||
|
|
||||||
|
print('Access logloss metric directly from validation_0:')
|
||||||
|
print(evals_result['validation_0']['logloss'])
|
||||||
|
|
||||||
|
print('')
|
||||||
|
print('Access metrics through a loop:')
|
||||||
|
for e_name, e_mtrs in evals_result.items():
|
||||||
|
print('- {}'.format(e_name))
|
||||||
|
for e_mtr_name, e_mtr_vals in e_mtrs.items():
|
||||||
|
print(' - {}'.format(e_mtr_name))
|
||||||
|
print(' - {}'.format(e_mtr_vals))
|
||||||
|
|
||||||
|
print('')
|
||||||
|
print('Access complete dict:')
|
||||||
|
print(evals_result)
|
||||||
@@ -8,7 +8,7 @@ import pickle
|
|||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.cross_validation import KFold
|
from sklearn.cross_validation import KFold, train_test_split
|
||||||
from sklearn.metrics import confusion_matrix, mean_squared_error
|
from sklearn.metrics import confusion_matrix, mean_squared_error
|
||||||
from sklearn.grid_search import GridSearchCV
|
from sklearn.grid_search import GridSearchCV
|
||||||
from sklearn.datasets import load_iris, load_digits, load_boston
|
from sklearn.datasets import load_iris, load_digits, load_boston
|
||||||
@@ -65,3 +65,13 @@ print("Pickling sklearn API models")
|
|||||||
pickle.dump(clf, open("best_boston.pkl", "wb"))
|
pickle.dump(clf, open("best_boston.pkl", "wb"))
|
||||||
clf2 = pickle.load(open("best_boston.pkl", "rb"))
|
clf2 = pickle.load(open("best_boston.pkl", "rb"))
|
||||||
print(np.allclose(clf.predict(X), clf2.predict(X)))
|
print(np.allclose(clf.predict(X), clf2.predict(X)))
|
||||||
|
|
||||||
|
# Early-stopping
|
||||||
|
|
||||||
|
X = digits['data']
|
||||||
|
y = digits['target']
|
||||||
|
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
|
||||||
|
clf = xgb.XGBClassifier()
|
||||||
|
clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
|
||||||
|
eval_set=[(X_test, y_test)])
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ dim(train)
|
|||||||
train[1:6,1:5, with =F]
|
train[1:6,1:5, with =F]
|
||||||
|
|
||||||
# Test dataset dimensions
|
# Test dataset dimensions
|
||||||
dim(train)
|
dim(test)
|
||||||
|
|
||||||
# Test content
|
# Test content
|
||||||
test[1:6,1:5, with =F]
|
test[1:6,1:5, with =F]
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ Using XGBoost for regression is very similar to using it for binary classificati
|
|||||||
The dataset we used is the [computer hardware dataset from the UCI repository](https://archive.ics.uci.edu/ml/datasets/Computer+Hardware). The demo for regression is almost the same as the [binary classification demo](../binary_classification), except for a little difference in the general parameters:
|
The dataset we used is the [computer hardware dataset from the UCI repository](https://archive.ics.uci.edu/ml/datasets/Computer+Hardware). The demo for regression is almost the same as the [binary classification demo](../binary_classification), except for a little difference in the general parameters:
|
||||||
```
|
```
|
||||||
# General parameter
|
# General parameter
|
||||||
# this is the only difference with classification, use reg:linear to do linear classification
|
# this is the only difference with classification, use reg:linear to do linear regression
|
||||||
# when labels are in [0,1] we can also use reg:logistic
|
# when labels are in [0,1] we can also use reg:logistic
|
||||||
objective = reg:linear
|
objective = reg:linear
|
||||||
...
|
...
|
||||||
|
|||||||
7
doc/.gitignore
vendored
Normal file
7
doc/.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
html
|
||||||
|
latex
|
||||||
|
*.sh
|
||||||
|
_*
|
||||||
|
doxygen
|
||||||
|
parser.py
|
||||||
|
*.pyc
|
||||||
192
doc/Makefile
Normal file
192
doc/Makefile
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
# Makefile for Sphinx documentation
|
||||||
|
#
|
||||||
|
|
||||||
|
# You can set these variables from the command line.
|
||||||
|
SPHINXOPTS =
|
||||||
|
SPHINXBUILD = sphinx-build
|
||||||
|
PAPER =
|
||||||
|
BUILDDIR = _build
|
||||||
|
|
||||||
|
# User-friendly check for sphinx-build
|
||||||
|
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
|
||||||
|
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Internal variables.
|
||||||
|
PAPEROPT_a4 = -D latex_paper_size=a4
|
||||||
|
PAPEROPT_letter = -D latex_paper_size=letter
|
||||||
|
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
||||||
|
# the i18n builder cannot share the environment and doctrees with the others
|
||||||
|
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
||||||
|
|
||||||
|
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
|
||||||
|
|
||||||
|
help:
|
||||||
|
@echo "Please use \`make <target>' where <target> is one of"
|
||||||
|
@echo " html to make standalone HTML files"
|
||||||
|
@echo " dirhtml to make HTML files named index.html in directories"
|
||||||
|
@echo " singlehtml to make a single large HTML file"
|
||||||
|
@echo " pickle to make pickle files"
|
||||||
|
@echo " json to make JSON files"
|
||||||
|
@echo " htmlhelp to make HTML files and a HTML help project"
|
||||||
|
@echo " qthelp to make HTML files and a qthelp project"
|
||||||
|
@echo " applehelp to make an Apple Help Book"
|
||||||
|
@echo " devhelp to make HTML files and a Devhelp project"
|
||||||
|
@echo " epub to make an epub"
|
||||||
|
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
|
||||||
|
@echo " latexpdf to make LaTeX files and run them through pdflatex"
|
||||||
|
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
|
||||||
|
@echo " text to make text files"
|
||||||
|
@echo " man to make manual pages"
|
||||||
|
@echo " texinfo to make Texinfo files"
|
||||||
|
@echo " info to make Texinfo files and run them through makeinfo"
|
||||||
|
@echo " gettext to make PO message catalogs"
|
||||||
|
@echo " changes to make an overview of all changed/added/deprecated items"
|
||||||
|
@echo " xml to make Docutils-native XML files"
|
||||||
|
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
|
||||||
|
@echo " linkcheck to check all external links for integrity"
|
||||||
|
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
|
||||||
|
@echo " coverage to run coverage check of the documentation (if enabled)"
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf $(BUILDDIR)/*
|
||||||
|
|
||||||
|
html:
|
||||||
|
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
|
||||||
|
|
||||||
|
dirhtml:
|
||||||
|
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
|
||||||
|
|
||||||
|
singlehtml:
|
||||||
|
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
|
||||||
|
|
||||||
|
pickle:
|
||||||
|
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
|
||||||
|
@echo
|
||||||
|
@echo "Build finished; now you can process the pickle files."
|
||||||
|
|
||||||
|
json:
|
||||||
|
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
|
||||||
|
@echo
|
||||||
|
@echo "Build finished; now you can process the JSON files."
|
||||||
|
|
||||||
|
htmlhelp:
|
||||||
|
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
|
||||||
|
@echo
|
||||||
|
@echo "Build finished; now you can run HTML Help Workshop with the" \
|
||||||
|
".hhp project file in $(BUILDDIR)/htmlhelp."
|
||||||
|
|
||||||
|
qthelp:
|
||||||
|
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
|
||||||
|
@echo
|
||||||
|
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
|
||||||
|
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
|
||||||
|
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/rabit.qhcp"
|
||||||
|
@echo "To view the help file:"
|
||||||
|
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/rabit.qhc"
|
||||||
|
|
||||||
|
applehelp:
|
||||||
|
$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
|
||||||
|
@echo "N.B. You won't be able to view it unless you put it in" \
|
||||||
|
"~/Library/Documentation/Help or install it in your application" \
|
||||||
|
"bundle."
|
||||||
|
|
||||||
|
devhelp:
|
||||||
|
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
|
||||||
|
@echo
|
||||||
|
@echo "Build finished."
|
||||||
|
@echo "To view the help file:"
|
||||||
|
@echo "# mkdir -p $$HOME/.local/share/devhelp/rabit"
|
||||||
|
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/rabit"
|
||||||
|
@echo "# devhelp"
|
||||||
|
|
||||||
|
epub:
|
||||||
|
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
|
||||||
|
|
||||||
|
latex:
|
||||||
|
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||||
|
@echo
|
||||||
|
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
|
||||||
|
@echo "Run \`make' in that directory to run these through (pdf)latex" \
|
||||||
|
"(use \`make latexpdf' here to do that automatically)."
|
||||||
|
|
||||||
|
latexpdf:
|
||||||
|
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||||
|
@echo "Running LaTeX files through pdflatex..."
|
||||||
|
$(MAKE) -C $(BUILDDIR)/latex all-pdf
|
||||||
|
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
||||||
|
|
||||||
|
latexpdfja:
|
||||||
|
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||||
|
@echo "Running LaTeX files through platex and dvipdfmx..."
|
||||||
|
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
|
||||||
|
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
||||||
|
|
||||||
|
text:
|
||||||
|
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The text files are in $(BUILDDIR)/text."
|
||||||
|
|
||||||
|
man:
|
||||||
|
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
|
||||||
|
|
||||||
|
texinfo:
|
||||||
|
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
|
||||||
|
@echo "Run \`make' in that directory to run these through makeinfo" \
|
||||||
|
"(use \`make info' here to do that automatically)."
|
||||||
|
|
||||||
|
info:
|
||||||
|
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
||||||
|
@echo "Running Texinfo files through makeinfo..."
|
||||||
|
make -C $(BUILDDIR)/texinfo info
|
||||||
|
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
|
||||||
|
|
||||||
|
gettext:
|
||||||
|
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
|
||||||
|
|
||||||
|
changes:
|
||||||
|
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
|
||||||
|
@echo
|
||||||
|
@echo "The overview file is in $(BUILDDIR)/changes."
|
||||||
|
|
||||||
|
linkcheck:
|
||||||
|
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
|
||||||
|
@echo
|
||||||
|
@echo "Link check complete; look for any errors in the above output " \
|
||||||
|
"or in $(BUILDDIR)/linkcheck/output.txt."
|
||||||
|
|
||||||
|
doctest:
|
||||||
|
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
|
||||||
|
@echo "Testing of doctests in the sources finished, look at the " \
|
||||||
|
"results in $(BUILDDIR)/doctest/output.txt."
|
||||||
|
|
||||||
|
coverage:
|
||||||
|
$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
|
||||||
|
@echo "Testing of coverage in the sources finished, look at the " \
|
||||||
|
"results in $(BUILDDIR)/coverage/python.txt."
|
||||||
|
|
||||||
|
xml:
|
||||||
|
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
|
||||||
|
|
||||||
|
pseudoxml:
|
||||||
|
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
|
||||||
|
@echo
|
||||||
|
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
|
||||||
7
doc/README
Normal file
7
doc/README
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
The documentation of xgboost is generated with recommonmark and sphinx.
|
||||||
|
|
||||||
|
You can build it locally by typing "make html" in this folder.
|
||||||
|
- clone https://github.com/tqchen/recommonmark to root
|
||||||
|
- type make html
|
||||||
|
|
||||||
|
Check out https://recommonmark.readthedocs.org for a guide on how to write markdown with the extensions used in this doc, such as math formulas and table of contents.
|
||||||