Compare commits

...

1596 Commits
v0.21 ... v0.40

Author SHA1 Message Date
Tianqi Chen
cb4d7f821f Update README.md 2015-05-11 23:44:02 -07:00
tqchen
42bf52f462 0.4 2015-05-11 23:42:49 -07:00
hetong
755eab8949 update date 2015-05-11 20:58:41 -07:00
hetong
c05cc48dfa delete abundant file 2015-05-11 20:55:09 -07:00
hetong007
cfdd6029a8 rename demo of early stopping 2015-05-11 16:59:18 -07:00
Tong He
d7da4189dc Merge pull request #296 from by321/master
new parameter in xgboost() and xgb.train() to print every N-th progress message
2015-05-11 16:55:14 -07:00
hetong007
90096e718c fix early stopping 2015-05-11 16:53:51 -07:00
hetong007
83ace55f51 add early stopping to xgb.cv 2015-05-11 16:03:40 -07:00
hetong007
60d307c445 add poisson demo 2015-05-11 15:21:54 -07:00
by321
5dacab0e22 new parameter in xgboost() and xgb.train() to print every N-th progress message 2015-05-11 14:18:24 -07:00
Tianqi Chen
9c0ba67088 Update README.md 2015-05-11 08:45:59 -07:00
Tianqi Chen
8b9e87790a Merge pull request #299 from jseabold/pickle-xgbooster
ENH: Pickle xgbooster enhancments. Thanks!
2015-05-11 08:44:36 -07:00
Skipper Seabold
15ea00540a EX: Make separate example for fork issue. 2015-05-11 09:30:51 -05:00
Skipper Seabold
fa8c6e2f0b DOC: Add warning about fork + openmp 2015-05-11 09:09:08 -05:00
Skipper Seabold
99c2df9913 EX: Show example of pickling and parallel use. 2015-05-11 09:09:08 -05:00
Skipper Seabold
932af821c5 CLN: Remove unused import. Fix comment. 2015-05-11 09:09:05 -05:00
Tianqi Chen
08848ab3ee Update README.md 2015-05-10 17:45:20 -07:00
Tianqi Chen
6f56e0f4ef Merge pull request #307 from pommedeterresautee/master
cleaning Rmarkdown
2015-05-10 08:51:42 -07:00
El Potaeto
3104f1f806 wording + presentation Otto rmarkdown 2015-05-10 09:39:21 +02:00
El Potaeto
cebca6846d ref in README 2015-05-10 09:38:48 +02:00
hetong007
d3564f34d5 Merge branch 'master' of github.com:dmlc/xgboost 2015-05-09 18:09:05 -07:00
hetong007
3f9921762a support both early stop name 2015-05-09 18:08:47 -07:00
tqchen
3a534d264d fix wrapper gc bug 2015-05-09 17:39:45 -07:00
tqchen
9a85c108e2 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2015-05-09 17:39:11 -07:00
Tong He
f6fc38f7af Merge pull request #298 from pommedeterresautee/master
Documentation improvement
2015-05-08 15:15:56 -07:00
pommedeterresautee
11ba651a07 Regularization parameters documentation improvement 2015-05-08 16:59:29 +02:00
pommedeterresautee
e92d384a6a small change in the wording of Otto R markdown 2015-05-08 16:29:29 +02:00
tqchen
a4de0ebcd4 change numpy to bytearray as buffer 2015-05-07 18:21:15 -07:00
tqchen
6942980ebb Merge branch 'master' of ssh://github.com/dmlc/xgboost 2015-05-07 18:13:29 -07:00
tqchen
68444a0626 fix pkl problem 2015-05-07 18:11:40 -07:00
Tianqi Chen
0af5cfbac3 Merge pull request #291 from pommedeterresautee/master
Rmarkdown improvement
2015-05-07 10:28:40 -07:00
Tianqi Chen
c6c7dc0a93 Update CHANGES.md 2015-05-06 17:11:39 -07:00
Tianqi Chen
2d748fb6fa Update xgboost.py 2015-05-06 16:46:27 -07:00
tqchen
60bf389825 update version to be consistent with python 2015-05-06 16:45:05 -07:00
tqchen
594bed34e4 fix saveraw 2015-05-06 16:42:27 -07:00
tqchen
382dcf6c34 Merge branch 'jseabold-xgb-pickleable' 2015-05-06 16:08:51 -07:00
tqchen
62f938d2b4 Merge branch 'xgb-pickleable' of https://github.com/jseabold/xgboost into jseabold-xgb-pickleable 2015-05-06 16:08:48 -07:00
tqchen
3244f1e9ae Merge branch 'jseabold-xgb-pickleable' 2015-05-06 16:03:36 -07:00
tqchen
76bad1c4cc Merge branch 'xgb-pickleable' of https://github.com/jseabold/xgboost into jseabold-xgb-pickleable 2015-05-06 16:03:24 -07:00
Tong He
ba49f82ace update to 0.4 2015-05-06 15:46:15 -07:00
tqchen
ab6a3b1ee8 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2015-05-06 15:43:22 -07:00
tqchen
7f7947f31c add with pbuffer info to model, allow xgb model to be saved in a more memory compact way 2015-05-06 15:43:15 -07:00
hetong007
993d7b9da3 update roxygen2 2015-05-06 15:23:37 -07:00
hetong007
419e4dbda6 add demo for early_stopping in R 2015-05-06 15:14:29 -07:00
El Potaeto
fd983dfb97 wording 2015-05-07 00:08:45 +02:00
El Potaeto
a985d7dd2b add CSS 2015-05-06 23:31:00 +02:00
Skipper Seabold
13837060f1 ENH: Don't use tempfiles for save/load 2015-05-06 15:02:26 -05:00
Skipper Seabold
11fa419720 ENH: Make XGBModel pickleable. 2015-05-06 12:37:07 -05:00
hetong007
0f182b0b66 fix logic 2015-05-05 16:44:36 -07:00
hetong007
54fb49ee5c add early stopping to R 2015-05-05 16:31:49 -07:00
Tong He
3b4697786e Merge pull request #288 from pommedeterresautee/master
small changes in RMarkdown
2015-05-05 14:58:56 -07:00
El Potaeto
8aa739d374 fix 2015-05-05 23:49:12 +02:00
El Potaeto
5eeec6a33f small changes in RMarkdown 2015-05-05 23:45:43 +02:00
Tong He
937a75bcb1 fix typo 2015-05-05 11:00:49 -07:00
Tong He
c242f9bb66 improve tree graph 2015-05-04 15:25:12 -07:00
Tianqi Chen
a3ad9df0b4 Update understandingXGBoostModel.Rmd 2015-05-04 14:27:44 -07:00
Tong He
2157146cea minor changes 2015-05-04 13:56:45 -07:00
Tianqi Chen
206f3cdbe0 msvc 2015-05-04 11:13:19 -07:00
Tianqi Chen
37d704826a Update parameter.md 2015-05-04 10:51:51 -07:00
tqchen
667a752e04 add poisson regression 2015-05-04 10:48:25 -07:00
tqchen
a310db86a1 new rmarkdown 2015-05-03 14:02:15 -07:00
tqchen
32b1d9d6b0 some minor fix 2015-05-03 13:59:38 -07:00
Tianqi Chen
a8d059902d Merge pull request #283 from pommedeterresautee/master
OTTO Rmarkdown
2015-05-03 09:09:49 -07:00
El Potaeto
1b95df4e54 parameter change in OTTO ramarkdown 2015-05-03 12:57:18 +02:00
El Potaeto
5fa2abee6e wording 2015-05-03 12:55:13 +02:00
El Potaeto
feac425851 trees 2015-05-03 12:52:43 +02:00
El Potaeto
514c5fd447 upgrade DiagrammeR to fix a bug in v 0.5 2015-05-03 12:18:44 +02:00
Tianqi Chen
5b430ee019 Update xgboost.py 2015-05-02 19:29:17 -07:00
Tianqi Chen
8c59c82d92 Merge pull request #282 from ujwlkarn/patch-1
Fixed typos and sentence structure
2015-05-02 09:07:14 -07:00
Ujjwal Karn
897180b2c6 fixed typos and sentence structure 2015-05-02 14:23:33 +05:30
Tianqi Chen
b1f489fd8b Merge pull request #281 from fyears/patch-2
update build instruction in OS X
2015-05-01 23:00:00 -07:00
fyears
5e89943ed0 update build instruction in OS X
`bash xgboost/build.sh` does not work as expected, so `cd` then `build.sh`. And remove the outdated information.
2015-05-01 22:58:53 -07:00
tqchen
5466b36ddb Merge branch 'master' of ssh://github.com/dmlc/xgboost 2015-05-01 22:46:22 -07:00
tqchen
7297c2352f Merge commit '7258f3353c8cc3ee3dd3c00c987fa0b189e58723' 2015-05-01 22:46:14 -07:00
tqchen
7258f3353c Squashed 'subtree/rabit/' changes from 24f17df..fa99857
fa99857 try fix warning on some platforms

git-subtree-dir: subtree/rabit
git-subtree-split: fa99857467
2015-05-01 22:46:14 -07:00
tqchen
869c68f149 minor 2015-05-01 22:46:06 -07:00
Tianqi Chen
90b2c0946e Merge pull request #280 from fyears/patch-1
The complete ways to install XGBoost in OS X.
2015-05-01 20:41:58 -07:00
fyears
99eaf771c4 The complete ways to install XGBoost in OS X. 2015-05-01 20:33:38 -07:00
Tianqi Chen
fe32725fa0 Update README.md 2015-05-01 15:58:51 -07:00
Tong He
4ff6697d83 Merge pull request #278 from khotilov/custom_loss_cv_fix
Improved logic in stratified CV
2015-05-01 14:46:05 -07:00
Vadim Khotilovich
c18e081f48 cleanup 2015-05-01 16:16:50 -05:00
Vadim Khotilovich
f05c7d87cb Merge remote branch 'src/master' into custom_loss_cv_fix 2015-05-01 15:42:50 -05:00
Vadim Khotilovich
0a3e7722fd a safeguard against someone using automatic folds creation with ranking 2015-05-01 15:16:30 -05:00
Vadim Khotilovich
f325930bd9 Improved logic in stratified CV to guess class/regr
Somewhat more robust and clear logic in stratified CV to guess classification/regression settings. Allows to accomodate custom objectives (classification is assumed when number of unique values in labels <= 5).
2015-05-01 15:08:08 -05:00
tqchen
2b3b55554f add parameter tunning 2015-05-01 11:41:18 -07:00
tqchen
6f0cbcaf2b add build instruction to doc 2015-05-01 11:12:43 -07:00
Tianqi Chen
8a411150ea Update sparse_batch_page.h 2015-05-01 10:55:42 -07:00
El Potaeto
d74d199a1e small change in the documentation 2015-05-01 13:03:15 +02:00
El Potaeto
962837bab7 OTTO markdown improvement 2015-05-01 13:02:43 +02:00
El Potaeto
52afe1cd7e OTTO markdown 2015-05-01 09:49:04 +02:00
El Potaeto
9f3b02cc3e multiclass documentation 2015-05-01 09:48:07 +02:00
El Potaeto
d860469030 Roxygen update 2015-05-01 09:47:18 +02:00
Tianqi Chen
654aa0b3b5 Update README.md 2015-04-30 15:45:41 -07:00
Tianqi Chen
68d9e7d673 Update README.md 2015-04-30 15:44:27 -07:00
Tong He
bab7b58d94 Merge pull request #227 from khotilov/master
add stratified cross validation for classification
2015-04-30 11:39:52 -07:00
tqchen
188d81d64a Merge branch 'master' of ssh://github.com/dmlc/xgboost 2015-04-29 20:25:06 -07:00
tqchen
c77fa7a670 Squashed 'subtree/rabit/' changes from 4fe8d1d..24f17df
24f17df ok

git-subtree-dir: subtree/rabit
git-subtree-split: 24f17df782
2015-04-29 20:23:56 -07:00
tqchen
b2bd79bc76 Merge commit 'c77fa7a670133ac40d6387cc2e958d5fc7cae8c4' 2015-04-29 20:23:56 -07:00
tqchen
18164e677a Squashed 'subtree/rabit/' changes from d1d2ab4..4fe8d1d
4fe8d1d ok io
a5d77ca checkin new dmlc interface

git-subtree-dir: subtree/rabit
git-subtree-split: 4fe8d1d66b
2015-04-29 20:22:11 -07:00
tqchen
32a7c906b4 Merge commit '18164e677af11f8d8be49c3cfb8c3960b9e800fa' 2015-04-29 20:22:11 -07:00
Tianqi Chen
d7846d0ef9 Update README.md 2015-04-28 19:14:32 -07:00
Tianqi Chen
0c7e6327fb Update README.md 2015-04-28 19:13:13 -07:00
Tianqi Chen
d4fcebf8c5 Merge pull request #274 from gitter-badger/gitter-badge
Add a Gitter chat badge to README.md
2015-04-28 19:12:20 -07:00
The Gitter Badger
7b730093a0 Added Gitter badge 2015-04-29 02:11:32 +00:00
Tong He
0de862cdbc Merge pull request #271 from pommedeterresautee/master
Suppress a Note in Cran check
2015-04-28 15:36:33 -07:00
tqchen
afe0a552e0 Squashed 'subtree/rabit/' changes from e1ddcc2..d1d2ab4
d1d2ab4 remove at end

git-subtree-dir: subtree/rabit
git-subtree-split: d1d2ab4599
2015-04-28 10:50:54 -07:00
tqchen
55fe810232 Merge commit 'afe0a552e0689c14c875a0da445e6e417f4ac449' 2015-04-28 10:50:54 -07:00
El Potaeto
0c8b6e2008 Suppress a Note in Cran check 2015-04-28 15:23:23 +02:00
tqchen
e63faf0e85 minor shadow fix 2015-04-27 22:52:19 -07:00
tqchen
2eccdda3c5 strict cstyle pthread 2015-04-27 22:42:01 -07:00
tqchen
279758a92e some strict cxx98 check 2015-04-27 17:37:07 -07:00
hetong007
48bcc021f7 add Rbuildignore to avoid compile .o files 2015-04-27 17:09:47 -07:00
Tianqi Chen
856a18e457 Update README.md 2015-04-27 17:07:58 -07:00
Tianqi Chen
ed901ddbb8 Update README.md 2015-04-27 17:07:28 -07:00
tqchen
69627567da adapt new dmlc io interface 2015-04-27 16:04:14 -07:00
tqchen
1e56ba86d9 Squashed 'subtree/rabit/' changes from fed1683..e1ddcc2
e1ddcc2 Merge branch 'master' of ssh://github.com/dmlc/rabit
6745667 new dmlc io
c5b4610 sge scheduler change

git-subtree-dir: subtree/rabit
git-subtree-split: e1ddcc2eb7
2015-04-27 15:58:57 -07:00
tqchen
59b96cdda5 Merge commit '1e56ba86d9d3e44b14c0a8f5ff71369307dbe86c' 2015-04-27 15:58:57 -07:00
Tianqi Chen
6783b66b9f Merge pull request #269 from jseabold/decode-string-py3
Good, python3 compatibility is indeed something we need to be careful about
2015-04-27 10:45:39 -07:00
Skipper Seabold
ee7e8b6e8a COMPAT: Decode bytes object for Python 3. 2015-04-27 12:41:24 -05:00
Tianqi Chen
f271af488b Merge pull request #267 from jseabold/add-n-classes
Add n_classes_ to fitted XGBClassifier
2015-04-27 09:10:17 -07:00
Skipper Seabold
c1a24c0fb1 ENH: Add n_classes_ to fitted classifier. 2015-04-27 11:09:55 -05:00
Tianqi Chen
8ac89b290e Merge pull request #268 from jseabold/docstrings
DOC: Add docstrings to user-facing classes.
2015-04-27 09:08:56 -07:00
Skipper Seabold
efdbec4d4c DOC: Add docstrings to user-facing classes. 2015-04-27 11:01:46 -05:00
Tianqi Chen
abcc09286c Merge pull request #265 from yzliao/master
add doc for Python wrapper
2015-04-26 22:14:05 -07:00
Yizheng Liao
bb91bdea84 add doc for Python wrapper 2015-04-26 22:08:06 -07:00
Tianqi Chen
94fac1076a bugfix setup 2015-04-26 00:17:58 -07:00
tqchen
d16b2c9670 Squashed 'subtree/rabit/' changes from 27340f9..fed1683
fed1683 minor
c01520f change

git-subtree-dir: subtree/rabit
git-subtree-split: fed1683b9b
2015-04-25 21:24:54 -07:00
tqchen
2eb30e732d Merge commit 'd16b2c9670d1849a360b94d581250aa1796d4abd' 2015-04-25 21:24:54 -07:00
tqchen
b5690e618e Merge branch 'master' of ssh://github.com/dmlc/xgboost 2015-04-25 21:20:06 -07:00
tqchen
4abd76386b Merge commit 'c0e0fc0c91dabdb86f68eed78e4a8f2b94fd1c2d' 2015-04-25 21:19:59 -07:00
tqchen
c0e0fc0c91 Squashed 'subtree/rabit/' changes from 82ca10a..27340f9
27340f9 final minor
e03eabc allow win32

git-subtree-dir: subtree/rabit
git-subtree-split: 27340f95e4
2015-04-25 21:19:58 -07:00
Tianqi Chen
6c83a94204 enable msvc win32 project 2015-04-25 21:14:07 -07:00
tqchen
5e63b5d469 Merge commit 'be1c530a0c92701841fa6a427d4f6a53d299cdeb' 2015-04-25 20:52:51 -07:00
tqchen
be1c530a0c Squashed 'subtree/rabit/' changes from c679671..82ca10a
82ca10a better handling at msvc
6601939 Merge pull request #12 from zjf/patch-2
df8f917 Update rabit-inl.h
c60b284 resize during tracker print

git-subtree-dir: subtree/rabit
git-subtree-split: 82ca10acb6
2015-04-25 20:52:51 -07:00
Tianqi Chen
afdebe8d8f fix platform dependent thing 2015-04-25 20:40:43 -07:00
Tianqi Chen
84515cd2a8 fix python windows installation problem, enable mingw compile, but seems mingw dll was not fast in loading 2015-04-25 15:30:42 -07:00
Tianqi Chen
4275434ec5 Merge pull request #260 from dmlc/colopt
Colopt
2015-04-25 10:15:33 -07:00
tqchen
5870b47d76 faster external memory 2015-04-25 10:14:56 -07:00
tqchen
b31d1c4ad9 check in colopt 2015-04-25 09:37:07 -07:00
Tianqi Chen
f28a7a0f8d Merge pull request #254 from lihang00/master
Python: add more params in sklearn wrapper.
2015-04-24 14:17:28 -07:00
HangLi
c6d2e16b61 remove eval_metric 2015-04-24 10:37:20 -07:00
HangLi
0058ebac9a add more params 2015-04-24 08:50:22 -07:00
Tianqi Chen
1d5b4e19a5 Merge pull request #258 from yzliao/master
remove print in Python function get_fscore()
2015-04-24 08:49:47 -07:00
Yizheng Liao
b5c8085638 remove print in Python get_fscore() 2015-04-23 23:40:10 -07:00
Yizheng Liao
84b82ab55f add flag variable in Python get_fscore() to control printing 2015-04-23 22:28:32 -07:00
Tianqi Chen
b94f7b0849 Merge pull request #257 from yzliao/master
Python: record evaluation results in train()
2015-04-23 21:51:09 -07:00
Yizheng Liao
1d8fc6280c correct format 2015-04-23 21:27:12 -07:00
Yizheng Liao
44d1043031 record training progress 2015-04-23 21:24:24 -07:00
HangLi
fcb833373b reorder parameters 2015-04-23 16:25:31 -07:00
Tianqi Chen
4aa1ea2d44 Merge pull request #252 from zjf/master
Fix a typo in comment
2015-04-23 14:37:26 -07:00
Tianqi Chen
dcb7ac81c1 Merge pull request #253 from tcfuji/master
Update README.md
2015-04-23 14:37:13 -07:00
HangLi
29e76c7ac0 add more params in sklearn wrapper. 2015-04-23 11:34:59 -07:00
Ted
7d3b51b873 Update README.md
Ensures OpenMP support
2015-04-23 14:08:39 -04:00
Jianfeng Zhu
11c45e5c60 Merge pull request #1 from zjf/zjf-patch-1
Update data.h
2015-04-23 14:22:10 +08:00
Jianfeng Zhu
f8ce8899bd Update data.h
Fix a minor typo, which may cause unnecessary confusion.
2015-04-23 14:21:05 +08:00
Tianqi Chen
e2c0ecbc92 Merge pull request #251 from zjf/patch-1
Update updater.h
2015-04-22 20:50:00 -07:00
Jianfeng Zhu
78907ca08d Update updater.h
Fix minor type
2015-04-23 11:44:47 +08:00
Tianqi Chen
d3af4e138f Merge pull request #249 from yzliao/master
add default value of gamma in parameter.md
2015-04-22 17:07:15 -07:00
Yizheng Liao
1b22ab7a7e add default value of gamma in parameter.md 2015-04-22 16:52:02 -07:00
Tianqi Chen
263d9bf84f Update README.md 2015-04-21 20:59:03 -07:00
tqchen
3e03c66e8a add note about distributed version 2015-04-20 12:37:23 -07:00
tqchen
0461231d3d more capacity for base 2015-04-20 16:21:55 +00:00
tqchen
dfec406afd half ram support 2015-04-19 21:29:13 -07:00
tqchen
5ad1555daf fix links to wiki 2015-04-19 14:23:47 -07:00
Tianqi Chen
a68928579b Update README.md 2015-04-19 14:21:12 -07:00
tqchen
50c1ce950f final chg 2015-04-19 14:07:39 -07:00
tqchen
315299aea8 add highlights 2015-04-19 14:07:08 -07:00
tqchen
6f14405b09 fix doc 2015-04-19 14:05:33 -07:00
tqchen
0220a22ca4 chg docs 2015-04-19 13:58:46 -07:00
tqchen
a1fdff0522 ok 2015-04-19 13:52:22 -07:00
tqchen
c6c868449c move documentation to repo 2015-04-19 13:48:19 -07:00
tqchen
5b042691b0 chg docs 2015-04-19 01:00:37 -07:00
Tianqi Chen
54a78b87dc Merge pull request #245 from dmlc/lite
Lite
2015-04-19 00:56:10 -07:00
tqchen
5123b07d73 add more docs 2015-04-19 00:55:11 -07:00
tqchen
44fd329b02 Squashed 'subtree/rabit/' changes from f52daf9..c679671
c679671 fix io style

git-subtree-dir: subtree/rabit
git-subtree-split: c67967161e
2015-04-19 00:23:02 -07:00
tqchen
ee112353cb Merge commit '44fd329b021bfd46a6b033a64467cda7d40310db' into lite 2015-04-19 00:23:02 -07:00
Tianqi Chen
18277086d9 fix windows warnings 2015-04-19 00:20:52 -07:00
tqchen
9527b55f35 fix makefile 2015-04-19 00:05:56 -07:00
tqchen
20da8bbe50 Squashed 'subtree/rabit/' changes from 7568f75..f52daf9
f52daf9 make timer cross platform

git-subtree-dir: subtree/rabit
git-subtree-split: f52daf9be1
2015-04-19 00:05:15 -07:00
tqchen
eb7cccffa4 Merge commit '20da8bbe504c0b81f6f3aff5b23f5bc3ee97d3f4' into lite 2015-04-19 00:05:15 -07:00
Bing Xu
47ee5e7c14 Update README.md 2015-04-18 14:46:00 -06:00
tqchen
5dfab4ba70 fast loader 2015-04-17 23:02:30 -07:00
tqchen
6d9cb3a2fa Merge branch 'lite' of ssh://github.com/tqchen/xgboost into lite
Conflicts:
	src/io/page_dmatrix-inl.hpp
2015-04-17 22:10:56 -07:00
tqchen
0a7d233c5d add 2015-04-17 22:09:26 -07:00
tqchen
788785f164 faster libsvm parser 2015-04-17 22:07:59 -07:00
tqchen
6bc5d6f0b4 Squashed 'subtree/rabit/' changes from 3bf8661..7568f75
7568f75 new io interface

git-subtree-dir: subtree/rabit
git-subtree-split: 7568f75f45
2015-04-17 21:07:33 -07:00
tqchen
c528c1e8e6 Merge commit '6bc5d6f0b44b957cc9f0d0b1fe5d420b0b59b8e2' into lite 2015-04-17 21:07:33 -07:00
tqchen
ddb7e538df OK 2015-04-16 17:03:18 -07:00
tqchen
22abf4e295 need more check 2015-04-16 12:34:39 -07:00
tqchen
a514340c96 current progress 2015-04-15 22:28:43 -07:00
tqchen
e8f6f3b541 some initial try of cachefiles 2015-04-15 15:15:23 -07:00
tqchen
3d8431fc5c simplify and parallelize data builder 2015-04-15 13:42:03 -07:00
Tianqi Chen
a596d11ed1 Merge pull request #241 from pommedeterresautee/master
Add experimental RF parameter documentation
2015-04-15 10:15:41 -07:00
El Potaeto
a49150a6d2 Redo readme modification 2015-04-15 18:49:52 +02:00
El Potaeto
de3f74f755 Merge remote-tracking branch 'dmlc/master' 2015-04-15 18:48:26 +02:00
El Potaeto
e4c8d9d2e1 clean 2015-04-15 18:47:31 +02:00
El Potaeto
511d74c631 clean 2015-04-15 18:46:28 +02:00
El Potaeto
ab8cf14fb9 cleaning 2015-04-15 18:44:06 +02:00
El Potaeto
0ae6d470c7 test 2015-04-15 18:36:53 +02:00
El Potaeto
925fa30316 Cancel readme modif 2015-04-15 18:32:04 +02:00
El Potaeto
2034b91b7d commit emtpy 2015-04-15 18:30:46 +02:00
pommedeterresautee
20dfcd7cec Add slides to readme + group documentation together 2015-04-14 00:48:11 +02:00
pommedeterresautee
12047056ae Update vignette 2015-04-14 00:39:51 +02:00
pommedeterresautee
4e1002a52c Experimental parameter 2015-04-14 00:30:55 +02:00
pommedeterresautee
aa0f612ac9 git ignore RProject files 2015-04-14 00:26:11 +02:00
tqchen
2b7c35870f Squashed 'subtree/rabit/' changes from 18f4d6c..3bf8661
3bf8661 add std before basic

git-subtree-dir: subtree/rabit
git-subtree-split: 3bf8661ec1
2015-04-13 13:44:41 -07:00
tqchen
6370b38c14 Merge commit '2b7c35870f7bf0ca7e28f53b322829007c91317e' 2015-04-13 13:44:41 -07:00
tqchen
24207d96fe new dmlc interface 2015-04-11 20:28:50 -07:00
tqchen
a30045c7cc Squashed 'subtree/rabit/' changes from 50a66b3..18f4d6c
18f4d6c remove rabit learn
bcfbe51 fix dmlc io
ad383b0 ok
3b8c04a Merge branch 'master' of ssh://github.com/dmlc/rabit
9dd97cc keepup with dmlc core
ef13aaf ch

git-subtree-dir: subtree/rabit
git-subtree-split: 18f4d6c0ba
2015-04-11 20:26:57 -07:00
tqchen
f55f8f023f Merge commit 'a30045c7cc54344e2084fb1fa3e01bfafc737188' 2015-04-11 20:26:57 -07:00
tqchen
bf7b750b86 add ignore 2015-04-11 09:25:19 -07:00
tqchen
91a7a5f2e2 add small boundary checking 2015-04-10 10:55:42 -07:00
Tianqi Chen
0ea28c35c4 Merge pull request #225 from chrissly31415/master
Fixing parsing of model dump text file in R
2015-04-09 09:53:38 -07:00
Tianqi Chen
7975dd03a9 Merge pull request #229 from nagadomi/fix_group_check_in_r
Fix length check in utils.R
2015-04-09 09:02:31 -07:00
tqchen
f4dbee5523 Squashed 'subtree/rabit/' changes from e08542c..50a66b3
50a66b3 fix empty engine

git-subtree-dir: subtree/rabit
git-subtree-split: 50a66b3855
2015-04-09 08:45:13 -07:00
tqchen
73ab391309 Merge commit 'f4dbee5523dc5816480f3c97cdb7192ceaec9dfc' 2015-04-09 08:45:13 -07:00
tqchen
c8c1dc6a3b xgboost update for dmlc changes 2015-04-08 17:42:54 -07:00
tqchen
3d11f56880 Squashed 'subtree/rabit/' changes from b15f6cd..e08542c
e08542c fix doc
e95c962 remove I prefix from interface, serializable now takes in pointer

git-subtree-dir: subtree/rabit
git-subtree-split: e08542c635
2015-04-08 17:39:45 -07:00
tqchen
9a6adb0f33 Merge commit '3d11f56880521c1d45504c965ae12886e9b72ace' 2015-04-08 17:39:45 -07:00
Tianqi Chen
23c273173f Merge pull request #230 from jseabold/python-install
Make the Python wrappers installable without path munging
2015-04-08 15:02:37 -07:00
Tong He
2c9631a254 Merge pull request #228 from khotilov/dep_reduction__mv2suggest
dependencies trim: moved external graphing packages to Suggests
2015-04-08 13:26:53 -07:00
Skipper Seabold
a0e07f16c4 Update demo scripts to use installed python library 2015-04-08 14:22:54 -05:00
Skipper Seabold
ceb62e9231 Update docs about python module install 2015-04-08 14:20:52 -05:00
Skipper Seabold
c972feb4b5 Make Python package installable. 2015-04-08 14:07:37 -05:00
nagadomi
87b4332cc1 Fix length check in utils.R 2015-04-09 02:25:47 +09:00
Vadim Khotilovich
76cef701ab moved the external graphing packages to Suggested in order to trim the dependencies 2015-04-07 18:02:29 -05:00
Vadim Khotilovich
aefd234da3 moved the external graphing packages to Suggested in order to trim the dependencies 2015-04-07 17:43:53 -05:00
Vadim Khotilovich
0405676734 Merge remote branch 'src/master' 2015-04-07 17:16:19 -05:00
Tianqi Chen
e91bacd378 Merge pull request #226 from white1033/master 2015-04-07 09:23:11 -07:00
white1033
b4545df0e3 *Fix Sklearn.grid_search error 2015-04-07 23:57:01 +08:00
chrissly31415
34cbbab84c fixing parsing of any numbers 2015-04-07 11:45:08 +02:00
chrissly31415
b39c16ea02 fixed parsing of negative reals, integers and scientific notation which
can occur in model dump
2015-04-07 10:57:54 +02:00
tqchen
01771c813d safe fix 2015-04-06 14:53:40 -07:00
tqchen
99f8dd280e push backward compatible fix 2015-04-06 14:50:21 -07:00
tqchen
36dcb061a8 larger boundary in edge case 2015-04-06 13:42:43 -07:00
tqchen
dc37023226 fix 2015-04-06 09:59:18 -07:00
tqchen
65abc26797 move distributed xgboost to wormhole 2015-04-06 09:56:45 -07:00
tqchen
421f5c6570 fix 2015-04-06 09:00:27 -07:00
tqchen
3cc48d6707 fix crash in error 2015-04-06 08:58:33 -07:00
tqchen
b6d85b9d9b fix label crash 2015-04-06 08:48:06 -07:00
tqchen
529a732737 add label error 2015-04-06 08:45:54 -07:00
tqchen
30e61084eb Merge branch 'master' of ssh://github.com/dmlc/xgboost 2015-04-05 20:42:27 -07:00
tqchen
0ffaeb8c64 add xgboost 2015-04-05 20:42:09 -07:00
Tianqi Chen
84957c3f84 update windows project for latest change 2015-04-05 20:13:20 -07:00
tqchen
8a3c0f1ae4 simple chg 2015-04-05 12:16:55 -07:00
tqchen
b8fd7c3c7c add instruction to build with s3 2015-04-05 12:10:59 -07:00
tqchen
fba9e5c714 quick fix 2015-04-05 12:01:19 -07:00
tqchen
5f902982f2 compile with dmlc 2015-04-05 11:26:06 -07:00
tqchen
89244b4aec Squashed 'subtree/rabit/' changes from 16975b4..b15f6cd
b15f6cd rabit unifires with dmlc
5634ec3 ok
2dd6c2f Merge branch 'master' of ssh://github.com/dmlc/rabit
38d7f99 checkin wormhole spliter
8acb96a Merge pull request #10 from ryanzz/master
911a1f0 fixed a mistake
732d8c3 inteface changing
684ea0a inteface changing
8cb4c02 add dmlc support
be2ff70 allow adapting wormhole

git-subtree-dir: subtree/rabit
git-subtree-split: b15f6cd2ac
2015-04-05 09:56:53 -07:00
tqchen
9b7907eda3 Merge commit '89244b4aec1f229b9ba1378389d4dea697389666' 2015-04-05 09:56:53 -07:00
Tianqi Chen
e626b62daa Merge pull request #220 from white1033/master
*Fix XGBClassifier super()
2015-04-05 09:05:08 -07:00
white1033
18cb8d7de2 fix indent warning by flake8 2015-04-05 23:22:40 +08:00
white1033
402e832ce5 *Fix XGBClassifier super() 2015-04-05 21:15:09 +08:00
Vadim Khotilovich
31b0e53cd4 make it possible to use a list of pre-defined CV folds in xgb.cv 2015-04-03 13:24:04 -05:00
Vadim Khotilovich
c03b42054f Merge remote branch 'src/master' 2015-04-03 13:18:40 -05:00
Vadim Khotilovich
271e8202a7 force xgb.cv to return numeric performance values instead of character; update its docs 2015-04-03 12:20:34 -05:00
Vadim Khotilovich
b04920d8e7 update documentation for xgb.cv 2015-04-03 11:14:09 -05:00
Tianqi Chen
93d3f4fe61 Merge pull request #217 from nerdcha/master
Bugfix for multiclass sklearn wrapper
2015-04-02 21:14:21 -07:00
Jamie Hall
d17cdd639f bugfix 2015-04-02 20:33:07 -07:00
Vadim Khotilovich
611d69c771 fix some wording 2015-04-02 19:59:06 -05:00
Vadim Khotilovich
b8711226e2 added an option for stratified CV to xgb.cv 2015-04-02 19:48:23 -05:00
Tianqi Chen
9b0dee986f Merge pull request #212 from zygmuntz/master
Early stopping for Python wrapper
2015-04-02 17:31:44 -07:00
Tianqi Chen
e9c95645a3 Merge pull request #215 from nerdcha/master
Scikit-Learn Wrapper For XGBoost
2015-04-02 12:25:55 -07:00
Zygmunt Zając
d7f9499f88 early_stopping_rounds for train() in Python wrapper 🔥 2015-04-02 19:43:30 +02:00
Jamie Hall
a1a427af37 Fix some stuff 2015-04-02 00:05:14 -07:00
Jamie Hall
136e902fb2 Initial commit 2015-04-01 23:29:05 -07:00
tqchen
8d1f4a40a5 Merge branch 'master' of ssh://github.com/dmlc/xgboost 2015-03-30 16:06:18 -07:00
tqchen
49e641012f add objective 2015-03-30 16:05:51 -07:00
Zygmunt Zając
39093bc432 early stopping for Python wrapper 2015-03-30 19:59:09 +02:00
Zygmunt Zając
7994858697 early stopping for Python wrapper 2015-03-30 19:58:25 +02:00
Zygmunt Zając
f9e157011f early stopping for Python wrapper 2015-03-30 19:56:03 +02:00
unknown
431277d5ca fix multi cv pred 2015-03-29 00:02:29 -07:00
unknown
37567e440c optim pred in cv 2015-03-28 23:41:19 -07:00
unknown
930497e271 fix matrix form prediction 2015-03-28 23:03:16 -07:00
El Potaeto
be6bd3859d Add Random Forest parameter (num_parallel_tree) in function doc + example in Vignette. 2015-03-29 01:52:26 +01:00
Tianqi Chen
b04591cbfc Update README.md 2015-03-28 08:58:30 -07:00
tqchen
68c2aaa7fe Squashed 'subtree/rabit/' changes from eb1f4a4..16975b4
16975b4 try pass on tokens during application submission

git-subtree-dir: subtree/rabit
git-subtree-split: 16975b447c
2015-03-27 11:09:38 -07:00
tqchen
135d461c40 Merge commit '68c2aaa7fe8c1f4688cef2ace67642e85fd1c9d2' 2015-03-27 11:09:38 -07:00
tqchen
0c349d6101 Squashed 'subtree/rabit/' changes from 59e63bc..eb1f4a4
eb1f4a4 change auto to ip

git-subtree-dir: subtree/rabit
git-subtree-split: eb1f4a4003
2015-03-26 23:33:41 -07:00
tqchen
38911fe2b2 Merge commit '0c349d6101652836f2ec23e48f94b4137aac6108' 2015-03-26 23:33:41 -07:00
tqchen
4eae8e8676 allow xgb.load re-use raw information if necessary 2015-03-26 16:54:29 -07:00
tqchen
98618646f6 bugfix booster.check 2015-03-26 16:43:01 -07:00
tqchen
23e46b7fa5 add max_delta_step 2015-03-26 09:47:16 -07:00
tqchen
149b43a0a8 Merge branch 'master' of ssh://github.com/dmlc/xgboost 2015-03-25 21:08:29 -07:00
tqchen
a84d6c55b3 more detailed explaination on windows build 2015-03-25 21:08:21 -07:00
Tong He
db0b06d19c add another solution to os x 2015-03-25 17:14:14 -07:00
hetong007
047c4b20de remove additional files 2015-03-25 16:06:51 -07:00
tqchen
08fb205102 cap second order gradient 2015-03-25 12:08:53 -07:00
tqchen
53c9a7b66b fix quantile for edge case, make logloss evaluation capped for extreme values 2015-03-24 23:52:42 -07:00
tqchen
d53e642b5d add debuglog for quantile 2015-03-23 21:17:50 -07:00
Tianqi Chen
da3a376384 Merge pull request #203 from pommedeterresautee/master
update links dmlc
2015-03-22 09:34:09 -07:00
El Potaeto
7d0ac3a3dd update links dmlc 2015-03-22 16:41:05 +01:00
tqchen
70045c41f9 change links 2015-03-21 23:12:55 -07:00
Tong He
03911cf748 Update README.md 2015-03-21 22:34:19 -07:00
Tianqi Chen
1a9a3a2fd0 Update README.md 2015-03-21 22:26:59 -07:00
Tianqi Chen
87741bded6 Update README.md 2015-03-21 22:26:24 -07:00
Tianqi Chen
25266796e9 Merge pull request #201 from pommedeterresautee/master
add video tuto to the README
2015-03-21 22:23:52 -07:00
tqchen
9ccbeaa8f0 Merge commit '75bf97b57539e5572e7ae8eba72bac6562c63c07'
Conflicts:
	subtree/rabit/rabit-learn/io/line_split-inl.h
	subtree/rabit/yarn/build.sh
2015-03-21 00:48:34 -07:00
tqchen
75bf97b575 Squashed 'subtree/rabit/' changes from 091634b..59e63bc
59e63bc minor
6233050 ok
14477f9 add namenode
75a6d34 add libhdfs opts
e3c76bf minmum fix
8b3c435 chg
2035799 test code
7751b2b add debug
7690313 ok
bd346b4 ok
faba1dc add testload
6f7783e add testload
e5f0340 ok
3ed9ec8 chg
e552ac4 ask for more ram in am
b2505e3 only stop nm when sucess
bc696c9 add queue info
f3e867e add option queue
5dc843c refactor fileio
cd9c81b quick fix
1e23af2 add virtual destructor to iseekstream
f165ffb fix hdfs
8cc6508 allow demo to pass in env
fad4d69 ok
0fd6197 fix more
7423837 fix more
d25de54 add temporal solution, run_yarn_prog.py
e5a9e31 final attempt
ed3bee8 add command back
0774000 add hdfs to resource
9b66e7e fix hadoop
6812f14 ok
08e1c16 change hadoop prefix back to hadoop home
d6b6828 Update build.sh
146e069 bugfix: logical boundary for ring buffer
19cb685 ok
4cf3c13 Merge branch 'master' of ssh://github.com/tqchen/rabit
20daddb add tracker
c57dad8 add ringbased passing and batch schedule
295d8a1 update
994cb02 add sge
014c866 OK

git-subtree-dir: subtree/rabit
git-subtree-split: 59e63bc135
2015-03-21 00:44:31 -07:00
Tong He
5648bec8a3 Update utils.R 2015-03-20 22:41:47 -07:00
hetong007
7ced224722 change name 2015-03-20 18:46:52 -07:00
Tong He
2e71d2dfe4 Update readme.md 2015-03-20 16:05:36 -07:00
hetong007
4bcc73f0c9 add kaggle otto folder 2015-03-20 13:34:20 -07:00
Tong He
f6722ba628 Update utils.R 2015-03-20 11:06:01 -07:00
El Potaeto
3777ad8f17 Merge remote-tracking branch 'upstream/master' 2015-03-20 10:16:48 +01:00
El Potaeto
2b24697d79 add tuto to the README 2015-03-20 10:14:38 +01:00
tqchen
360cc7118d fix cxx11 2015-03-19 11:53:55 -07:00
tqchen
e1538ae615 add new evaluation metric mlogloss for multi-class classification logloss 2015-03-19 11:34:38 -07:00
Tong He
8025b338a8 Merge pull request #199 from pommedeterresautee/master
Cross validation documentation improvement
2015-03-18 11:14:36 -07:00
pommedeterresautee
4094039ce5 README 2015-03-17 23:32:52 +01:00
pommedeterresautee
33205d1fbd Cross validation documentation improvement 2015-03-17 23:18:00 +01:00
Tong He
adfa023822 Merge pull request #198 from pommedeterresautee/master
Add new nrow function for xgb.DMatrix + small function doc changes
2015-03-17 12:29:00 -07:00
Tong He
a146f0c5e1 Update utils.R 2015-03-16 23:23:22 -07:00
Tong He
1e001f7cf3 add length check 2015-03-16 23:20:31 -07:00
pommedeterresautee
240c314ac0 doc 2015-03-16 00:12:23 +01:00
pommedeterresautee
9d1d76532d documentation 2015-03-16 00:10:18 +01:00
pommedeterresautee
6ca76fe784 doc 2015-03-15 23:59:28 +01:00
pommedeterresautee
81caba5dce new nrow function for xgb.DMatrix 2015-03-15 23:52:00 +01:00
pommedeterresautee
cdfa78a3b9 small changes in doc 2015-03-15 23:51:26 +01:00
tqchen
8386c2b7fa check r 2015-03-13 23:49:56 -07:00
Tianqi Chen
2159d18f0b Update param.h 2015-03-13 23:23:23 -07:00
Tianqi Chen
90ade3bb84 Merge pull request #193 from pommedeterresautee/master
Vignette text (very biiiiig change)
2015-03-13 14:50:49 -07:00
El Potaeto
93a019d174 code simplification 2015-03-12 23:44:08 +01:00
El Potaeto
09091884be Merge remote-tracking branch 'upstream/master' 2015-03-11 22:14:35 +01:00
tqchen
e52de85e59 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2015-03-11 11:22:56 -07:00
tqchen
12528c535a fix 2015-03-11 11:22:51 -07:00
tqchen
03f34824b4 some potential fix 2015-03-11 09:43:42 -07:00
tqchen
8437e43afc pass solaris compile 2015-03-11 09:15:34 -07:00
tqchen
52fe528615 fix rpack 2015-03-11 08:53:57 -07:00
Tong He
8f24f3cd5a Update speedtest.R 2015-03-10 22:55:48 -07:00
Tianqi Chen
d5303af068 fix vs warnings 2015-03-09 22:37:08 -07:00
tqchen
13a319ca01 Squashed 'subtree/rabit/' changes from d558f6f..091634b
091634b fix

git-subtree-dir: subtree/rabit
git-subtree-split: 091634b259
2015-03-09 14:58:23 -07:00
tqchen
5c389ed89a Merge commit '13a319ca01e6fadd0ec7592cff8e7b545af0994e' 2015-03-09 14:58:23 -07:00
tqchen
deceec3e10 update 2015-03-09 14:57:49 -07:00
tqchen
8f7e9abf89 Merge commit '4c060df2f17405dc26dc65a77e412d5c2a23525a'
Conflicts:
	subtree/rabit/tracker/rabit_yarn.py
2015-03-09 14:45:23 -07:00
tqchen
4c060df2f1 Squashed 'subtree/rabit/' changes from 28ca7be..d558f6f
d558f6f redefine distributed means
c8efc01 more complicated yarn script

git-subtree-dir: subtree/rabit
git-subtree-split: d558f6f550
2015-03-09 14:44:42 -07:00
tqchen
a8d5af39fd move stream to rabit part, support rabit on yarn 2015-03-09 14:43:46 -07:00
tqchen
57b5d7873f Squashed 'subtree/rabit/' changes from d4ec037..28ca7be
28ca7be add linear readme
ca4b20f add linear readme
1133628 add linear readme
6a11676 update docs
a607047 Update build.sh
2c1cfd8 complete yarn
4f28e32 change formater
2fbda81 fix stdin input
3258bcf checkin yarn master
67ebf81 allow setup from env variables
9b6bf57 fix hdfs
395d5c2 add make system
88ce767 refactor io, initial hdfs file access need test
19be870 chgs
a1bd3c6 Merge branch 'master' of ssh://github.com/tqchen/rabit
1a573f9 introduce input split
29476f1 fix timer issue

git-subtree-dir: subtree/rabit
git-subtree-split: 28ca7becbd
2015-03-09 13:28:38 -07:00
tqchen
9f7c6fe271 Merge commit '57b5d7873f4f0953357e9d98e9c60cff8373d7ec' 2015-03-09 13:28:38 -07:00
El Potaeto
21a4a32655 Vignette text 2015-03-08 21:57:31 +01:00
Tong He
66cf88f7b0 Merge pull request #192 from pommedeterresautee/master
Vignette improvement
2015-03-08 10:08:33 -07:00
tqchen
99ef34ca8c Merge branch 'master' of ssh://github.com/tqchen/xgboost 2015-03-08 09:55:40 -07:00
tqchen
e79840e620 fix wrapper checkNAN 2015-03-08 09:52:59 -07:00
El Potaeto
09e466764e Vignette text 2015-03-08 00:38:22 +01:00
El Potaeto
05dbc40186 space 2015-03-08 00:03:40 +01:00
El Potaeto
5a59c0b26c df spell 2015-03-08 00:02:14 +01:00
Tianqi Chen
2ec27679eb Merge pull request #190 from pommedeterresautee/master
trademark RF
2015-03-07 08:58:50 -08:00
tqchen
d202d8b977 more robust config parser 2015-03-07 08:52:56 -08:00
tqchen
bae1a08c9b remove mock from default build 2015-03-06 21:02:22 -08:00
El Potaeto
5bc9642d31 trademark RF 2015-03-04 12:09:50 +01:00
tqchen
39cb9d2c5e fix nan 2015-03-03 22:33:03 -08:00
hetong
841d076f20 change version of the package 2015-03-03 18:14:25 -08:00
tqchen
e50fa9e78f fix solaris 2015-03-03 13:16:20 -08:00
tqchen
ef2de29f06 Squashed 'subtree/rabit/' changes from 4db0a62..d4ec037
d4ec037 fix rabit
6612fcf Merge branch 'master' of ssh://github.com/tqchen/rabit
d29892c add mock option statis
4fa054e new tracker
75c647c update tracker for host IP
e4ce8ef add hadoop linear example
76ecb4a add hadoop linear example
2e1c4c9 add hadoop linear example

git-subtree-dir: subtree/rabit
git-subtree-split: d4ec037f2e
2015-03-03 13:13:21 -08:00
tqchen
3897b7bf99 Merge commit 'ef2de29f068c0b22a4fb85ca556b7b77950073d6' 2015-03-03 13:13:21 -08:00
tqchen
9fd8612700 fix cranchecks 2015-03-03 12:37:29 -08:00
hetong
ee6e8279eb add vcd back 2015-03-03 00:25:30 -08:00
hetong
41b080e35f To submit to CRAN we cannot use more than 2 threads in our examples/vignettes 2015-03-03 00:21:24 -08:00
Tong He
87ec48c1d3 change order of sentences
Dear Prof. Ripley said that "The Description field should not start with the package name, 'This package' or similar."
2015-03-02 22:45:49 -08:00
Tong He
aa60c44b25 Merge pull request #186 from pommedeterresautee/master
Presentation (CSS) : more space + more structure
2015-03-02 09:55:23 -08:00
El Potaeto
0c77726b55 CSS: Add slight line after Header 1 2015-03-02 14:47:00 +01:00
El Potaeto
a6a707f23c Add ref. 2015-03-02 14:37:25 +01:00
El Potaeto
4ee43f2167 CSS improvement, more space, change in style titles 2015-03-02 14:36:19 +01:00
Tong He
c62583bb0f Update discoverYourData.Rmd 2015-03-01 22:15:47 -08:00
Tong He
48deb49ba1 possible polishments 2015-03-01 22:02:23 -08:00
Tong He
57972ef2c2 Update xgboost.Rnw 2015-03-01 21:32:59 -08:00
tqchen
4210f9cf51 add conf 2015-03-01 20:41:26 -08:00
Tong He
576b8acfae Update xgboostPresentation.Rmd 2015-03-01 18:30:49 -08:00
Tong He
b8c0d8ba72 Merge pull request #185 from pommedeterresautee/master
Vignette improvement: more structure, more serious, less spell/grammar issues, better organization
2015-03-01 18:28:58 -08:00
El Potaeto
de6bedc7cb Vignette text 2015-03-01 21:35:36 +01:00
El Potaeto
711fb128cd Vignette text 2015-03-01 21:31:42 +01:00
El Potaeto
d88cf20c23 Vignette text 2015-03-01 21:25:14 +01:00
El Potaeto
a749cf3133 Vignette text 2015-03-01 21:22:26 +01:00
pommedeterresautee
46082a54c9 Vignette text 2015-03-01 13:01:42 +01:00
pommedeterresautee
8e52c4b45a Fix Vignette bug! 2015-03-01 12:13:38 +01:00
pommedeterresautee
4559477d63 text vignette 2015-03-01 11:01:03 +01:00
pommedeterresautee
2986d913ed Vignette text 2015-03-01 10:20:41 +01:00
hetong
8f0e99c3ce import vcd to eliminate note 2015-02-28 10:11:44 -08:00
Tong He
a96ac937f8 Merge pull request #184 from pommedeterresautee/master
fix warning
2015-02-26 16:01:58 -08:00
pommedeterresautee
8abd9c747a fix warning 2015-02-27 00:49:20 +01:00
Tianqi Chen
9784c471d5 Update README.md 2015-02-25 10:05:50 -08:00
Tianqi Chen
2c69a17e77 Update README.md 2015-02-25 10:00:52 -08:00
Tong He
8e93b18555 Merge pull request #182 from pommedeterresautee/master
Memory optimization in co occurence comp feature importance (use sparse Matrix if required) + Vignette text (spell, grammar...) + CSS
2015-02-23 13:19:34 -08:00
El Potaeto
56068b5453 text vignette 2015-02-22 00:17:37 +01:00
El Potaeto
56e9bff11f Vignette txt 2015-02-21 23:49:41 +01:00
El Potaeto
48390bdd6a text 2015-02-19 19:26:39 +01:00
El Potaeto
56877338b7 memory optimization 2015-02-19 13:48:39 +01:00
Tong He
dce522d7a1 Merge pull request #179 from pommedeterresautee/master
Generalize co-occurence count to not categorical feature only + Perf + Vignette + CSS + Function documentation
2015-02-18 16:55:40 -08:00
El Potaeto
815789bed6 fix 2015-02-19 00:16:50 +01:00
El Potaeto
d982f2746c small fixes 2015-02-18 19:41:13 +01:00
El Potaeto
83ddbbf03b splell 2015-02-18 17:14:08 +01:00
El Potaeto
8523fb9f49 avoid error message 2015-02-18 13:44:21 +01:00
El Potaeto
dabb0fd4c0 Merge remote-tracking branch 'upstream/master' 2015-02-18 13:25:15 +01:00
El Potaeto
f57f0f2543 Documentation feature importance 2015-02-18 13:19:39 +01:00
El Potaeto
8fd546ab3c vignette text 2015-02-18 13:13:27 +01:00
El Potaeto
1cfa810edb refix 2015-02-17 23:37:56 +01:00
El Potaeto
fe4f73920b Merge remote-tracking branch 'origin/master'
Conflicts:
	R-package/vignettes/discoverYourData.Rmd
	R-package/vignettes/vignette.css
2015-02-17 23:35:52 +01:00
El Potaeto
412a6e1085 Add comments 2015-02-17 23:30:36 +01:00
El Potaeto
08493c2b3d missing feature management 2015-02-17 23:27:02 +01:00
El Potaeto
d4731e7b29 vignette text 2015-02-17 23:06:09 +01:00
El Potaeto
2ea6fd9931 better CSS 2015-02-17 23:01:48 +01:00
El Potaeto
e2b2c21aef better co occurence function 2015-02-17 22:39:38 +01:00
pommedeterresautee
2e391ed0ee text refactor 2015-02-16 22:43:12 +01:00
pommedeterresautee
8e3c25ed33 css improvement 2015-02-16 22:35:01 +01:00
Tianqi Chen
15562126a6 Merge pull request #178 from aldanor/master
[python] Fixed the dll import for relative paths + various cleanup.
2015-02-16 09:51:40 -08:00
Ivan Smirnov
8660ea91b5 Fixed the dll import for relative paths + various cleanup.
- DLL import now works when __file__ is a relative path
- Various PEP8 and whitespace fixes + whitespace cleanup
- Docstring fixes (conform to numpydoc)
- Added __all__ to the module
- Fixed mutable default values
- Removed print statements
- py2/py3-compatible string-type checks
- Replace asserts with proper exceptions
- Make classes new-style (derive from object)
2015-02-16 16:03:47 +00:00
Tong He
1b92d9eadf Merge pull request #177 from pommedeterresautee/master
New co occurence computation (for importance feature function)
2015-02-15 16:48:33 -08:00
El Potaeto
f0eaac2174 Bug + documentation 2015-02-15 17:46:12 +01:00
El Potaeto
f84cc0843f fixed bug 2015-02-15 17:30:39 +01:00
El Potaeto
def2674dd1 Add new co-occurence computation capacity to importance feature function + related documentation 2015-02-15 17:15:47 +01:00
El Potaeto
d75194303b CSS improvement 2015-02-15 10:26:32 +01:00
Tong He
fe7651fe53 Merge pull request #175 from pommedeterresautee/master
markdown Vignette can be compiled as package Vignette (use devtools) + improve Vignette text
2015-02-14 14:38:45 -08:00
hetong007
3adfe4eeda not build the vignette 2015-02-13 13:13:29 -08:00
El Potaeto
3da261b6e7 add linear boosting part 2015-02-13 18:49:53 +01:00
Tianqi Chen
a718a43d92 Update mushroom.hadoop.conf 2015-02-13 09:04:05 -08:00
El Potaeto
9a4bf40e5e clean temp 2015-02-13 13:34:24 +01:00
El Potaeto
8a7d803e52 justified text in CSS 2015-02-13 13:28:04 +01:00
pommedeterresautee
ae9f7e9307 vignette text 2015-02-12 22:44:57 +01:00
pommedeterresautee
276b68b984 Vignette text 2015-02-12 22:22:00 +01:00
El Potaeto
7421f35136 vignette text 2015-02-12 20:05:38 +01:00
El Potaeto
ba36c495be text vignette 2015-02-12 17:36:10 +01:00
El Potaeto
7f71cc12f4 add bibliography 2015-02-12 17:19:11 +01:00
El Potaeto
8a8eb33114 fix temp file created by PDF 2015-02-12 15:47:53 +01:00
El Potaeto
df63c86afa git ignore update -> exclude generated vignette 2015-02-12 14:05:19 +01:00
El Potaeto
09a6522704 Vignette text 2015-02-12 13:59:45 +01:00
El Potaeto
234cf49e35 fix some CSS 2015-02-12 13:59:23 +01:00
El Potaeto
7bb2926414 add introduction paragraph from PDF file 2015-02-12 10:19:42 +01:00
El Potaeto
16ffd7c9b2 Comment wording 2015-02-12 09:56:27 +01:00
El Potaeto
f1f346713a Merge remote-tracking branch 'upstream/master' 2015-02-12 09:51:42 +01:00
Tianqi Chen
f8a314e2e4 Merge pull request #176 from tqchen/unity
pull rabit updates
2015-02-11 20:37:54 -08:00
tqchen
13776a006a Squashed 'subtree/rabit/' changes from 1bb8fe9..4db0a62
4db0a62 bugfix of lazy prepare
87017bd license
dc703e1 license
c171440 change license to bsd
7db2070 Update README.md
581fe06 add mocktest
d2f252f ok
4a5b9e5 add all
12ee049 init version of lbfgs
37a2837 complete lbfgs solver
6ade7cb complete lbfgs

git-subtree-dir: subtree/rabit
git-subtree-split: 4db0a62a06
2015-02-11 20:33:35 -08:00
tqchen
e923bdb12f Merge commit '13776a006a4e572720ec4c5b029b54771cf2b35c' into unity 2015-02-11 20:33:35 -08:00
pommedeterresautee
97cb8bf637 refactor vignette 2015-02-12 00:06:13 +01:00
tqchen
c40afa2023 fix sklearner 2015-02-11 11:37:14 -08:00
tqchen
c639efc71b Merge branch 'master' into unity 2015-02-11 10:58:19 -08:00
tqchen
2ec113b1be Merge branch 'unity'
Conflicts:
	R-package/R/predict.xgb.Booster.R
2015-02-11 10:58:09 -08:00
El Potaeto
adf8b6553d Vignettes 2015-02-11 18:01:36 +01:00
El Potaeto
d70f52d4b1 Vignette text 2015-02-11 15:25:25 +01:00
El Potaeto
e457b5ea58 Simplified my name :-) 2015-02-11 15:25:12 +01:00
El Potaeto
9d11936790 improve function documentation.
Switch xgboost detailed parameters with xgb.tain function.
2015-02-11 10:12:18 +01:00
tqchen
a16cbedfab try fix memleak when test data have more features than training 2015-02-10 21:49:29 -08:00
Tong He
292f4f0e0d Merge pull request #171 from pommedeterresautee/master
Vignette (1 updated, 1 new)
2015-02-10 16:19:54 -08:00
pommedeterresautee
dc9e4905e4 Vignette text 2015-02-10 22:48:16 +01:00
El Potaeto
d7ba5c1511 text vignette 2015-02-10 19:46:39 +01:00
El Potaeto
cefd55ef00 Vignettes improvement 2015-02-10 17:09:21 +01:00
El Potaeto
c0d8ae3781 text change 2015-02-10 13:59:13 +01:00
El Potaeto
423c3e6a8d improved vignette text 2015-02-10 13:54:30 +01:00
tqchen
a30635e0b4 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2015-02-09 21:08:07 -08:00
tqchen
e889da4cc1 new Rpack 2015-02-09 21:07:57 -08:00
hetong007
7f3dc7cf7e fix warnings 2015-02-09 18:38:23 -08:00
hetong007
25f508e43e update doc, resolve warnings 2015-02-09 17:48:52 -08:00
hetong007
47b5cf5148 fix save.raw 2015-02-09 17:35:50 -08:00
hetong007
4c25600d2a fix segfault and add two function for handle and booster 2015-02-09 17:28:48 -08:00
hetong007
0aef62dabc fix with new predict 2015-02-09 16:25:00 -08:00
hetong007
f7c838ffaa fix bugs 2015-02-09 16:16:11 -08:00
hetong007
5b611c355e add handle and raw structure to xgb.Booster 2015-02-09 15:51:24 -08:00
hetong007
ea5860d574 fix save.raw doc 2015-02-09 13:43:32 -08:00
Tong He
8c16491b42 Update xgb.save.raw.R 2015-02-09 13:31:21 -08:00
Tong He
ac3791bf74 Merge pull request #169 from pommedeterresautee/master
Fix some warnings in Cran check
2015-02-09 13:16:15 -08:00
pommedeterresautee
eecfd015fa Update CK.means version 2015-02-09 21:37:31 +01:00
pommedeterresautee
f4b454d6dd fix some warning in Cran check 2015-02-09 21:34:53 +01:00
Tianqi Chen
a3cf30592f Merge pull request #168 from pommedeterresautee/master
xgboost simplified documentation + dump function performance optimization (for big model)
2015-02-09 09:05:57 -08:00
El Potaeto
3971323203 fix bug 2015-02-09 18:01:14 +01:00
El Potaeto
0922883250 Optimization in dump function (replaced some regular R function by data.table) 2015-02-09 17:20:21 +01:00
El Potaeto
a45497e6f3 add web address 2015-02-08 22:46:59 +01:00
El Potaeto
76e24fdd36 documentation simplification 2015-02-08 22:46:29 +01:00
El Potaeto
29b5312428 remove not required dependency 2015-02-08 00:02:53 +01:00
El Potaeto
9d89441e38 small doc fix 2015-02-07 23:58:09 +01:00
El Potaeto
12b0e8e6d5 small doc fix 2015-02-07 23:57:48 +01:00
El Potaeto
75f205b0b1 fix documentation 2015-02-07 23:53:55 +01:00
El Potaeto
85739c537d new doc 2015-02-07 23:40:49 +01:00
El Potaeto
85186a2e55 remove buggy feature 2015-02-06 11:44:09 +01:00
tqchen
8b4acef662 remove sync from wrapper.h 2015-02-05 21:03:06 -08:00
El Potaeto
a82a942cd6 add importance feature sign 2015-02-05 17:25:37 +01:00
El Potaeto
68290546ca simplidied included column computation 2015-02-05 09:53:21 +01:00
El Potaeto
b7526671ba wording 2015-02-05 00:03:39 +01:00
El Potaeto
92652bffa1 wording 2015-02-05 00:01:13 +01:00
El Potaeto
9f5889f1e3 new included feature in dt.tree function 2015-02-04 23:59:53 +01:00
tqchen
b34a56b1f9 fix for ulong 2015-02-04 11:18:56 -08:00
Tong He
90c698ba13 Merge pull request #162 from pommedeterresautee/patch-1
Spell
2015-02-02 13:09:59 -08:00
Michaël Benesty
5d135858f7 Spell 2015-02-02 13:21:13 +01:00
tqchen
1d21ff87ff add saveload to raw 2015-02-01 21:19:24 -08:00
tqchen
dc3003cefd add saveload to raw 2015-02-01 21:17:37 -08:00
Tong He
6e91846c55 Merge pull request #155 from pommedeterresautee/master
fix mermaid + change in description + new plot importance feature function + fix bug in CV function + add 1 Vignette
2015-02-01 14:12:43 -08:00
El Potaeto
451944c52b CSS 2015-02-01 16:13:18 +01:00
El Potaeto
b31cbdb0a4 modif CSS 2015-02-01 16:13:13 +01:00
pommedeterresautee
a17e29b130 Fix bug in Cross Validation when showsd = FALSE 2015-02-01 14:08:48 +01:00
pommedeterresautee
9f5929497a version stringr 2015-02-01 13:09:27 +01:00
pommedeterresautee
f35950dc46 small change in package version 2015-02-01 13:02:33 +01:00
tqchen
02e98e0534 chg back to g++ 2015-01-30 21:47:49 -08:00
tqchen
3791ae5cf0 Squashed 'subtree/rabit/' changes from fb13cab..1bb8fe9
1bb8fe9 chg makefile

git-subtree-dir: subtree/rabit
git-subtree-split: 1bb8fe9615
2015-01-30 16:50:27 -08:00
tqchen
8b2dbbb782 Merge commit '3791ae5cf0a03aa64c763692cb4a5865816f37b6' 2015-01-30 16:50:27 -08:00
tqchen
b32d4faa82 quick fix seed 2015-01-30 16:50:10 -08:00
tqchen
9725cf2aeb Squashed 'subtree/rabit/' changes from 4ebe657..fb13cab
fb13cab change makefile
1479e37 fixed small bug in mpi submission script
0ca7a63 Update README.md
5ef4830 ok
93a1338 chg note

git-subtree-dir: subtree/rabit
git-subtree-split: fb13cab216
2015-01-30 16:41:06 -08:00
tqchen
25957bb1d4 Merge commit '9725cf2aeb26d5366ab659a59334b601b980f90b' 2015-01-30 16:41:06 -08:00
tqchen
42a4da91b5 chges 2015-01-30 16:40:58 -08:00
Tong He
964c668d44 Update DESCRIPTION 2015-01-29 16:20:13 -08:00
Tong He
f3b2c74153 Update README.md 2015-01-29 15:30:46 -08:00
Tong He
d788bf9aeb Update DESCRIPTION 2015-01-29 15:27:29 -08:00
Tong He
4d79ed9bb1 Update runall.R 2015-01-29 13:30:47 -08:00
El Potaeto
7ec17038f0 improve text of the Vignette 2015-01-29 10:30:50 +01:00
El Potaeto
f71aa2874c Vignette, 1st version 2015-01-28 21:43:18 +01:00
El Potaeto
170dcc49be doc 2015-01-28 21:42:58 +01:00
El Potaeto
e35a9f4822 Merge remote-tracking branch 'upstream/master' 2015-01-28 10:13:58 +01:00
tqchen
16db3ce620 quick fix 2015-01-27 16:31:53 -08:00
tqchen
3e0fba392d fix the integer overflow 2015-01-27 16:29:52 -08:00
pommedeterresautee
d6ef74386d ... 2015-01-27 22:36:35 +01:00
pommedeterresautee
5687af9774 fix error message during check 2015-01-27 22:29:29 +01:00
pommedeterresautee
e06c1da842 new plot feature importance function 2015-01-27 22:26:57 +01:00
tqchen
deb4983273 ok 2015-01-26 10:40:04 -08:00
tqchen
a264bc3969 ok 2015-01-26 10:30:12 -08:00
tqchen
e72174f0f8 add readme 2015-01-26 10:29:34 -08:00
tqchen
1f6b8eb344 Merge branch 'master' of ssh://github.com/tqchen/xgboost
Conflicts:
	.gitignore
2015-01-26 10:28:20 -08:00
tqchen
c34367b207 add msd 2015-01-26 10:27:44 -08:00
Tianqi Chen
97e058dbd7 Update README.md 2015-01-26 09:04:55 -08:00
Tianqi Chen
4266827105 Update README.md 2015-01-26 09:04:34 -08:00
El Potaeto
15dee73795 change in Description 2015-01-26 00:00:14 +01:00
hetong007
5188bad873 fix cv attr 2015-01-25 14:16:46 -08:00
El Potaeto
5e94126963 fix mermaid 2015-01-25 21:07:06 +01:00
El Potaeto
52a2b652d3 Documentation: no need to save model in txt... 2015-01-25 20:16:56 +01:00
hetong
f75387f701 update document 2015-01-25 10:37:11 -08:00
hetong
33101d5cad edit document 2015-01-25 10:31:48 -08:00
Tong He
8971f0ff50 Update xgboost.R 2015-01-25 10:21:24 -08:00
tqchen
f848844310 better warning at multiclass, fix cran check 2015-01-25 10:05:47 -08:00
Tong He
da9f0989c6 Merge pull request #152 from pommedeterresautee/master
Fix global variable message (Cran Checks)
2015-01-22 10:26:15 -08:00
El Potaeto
d188c997f0 add RStudio parameters to exclusion 2015-01-21 23:56:27 +01:00
pommedeterresautee
7f1aff7858 forget one variable 2015-01-21 22:07:30 +01:00
El Potaeto
f1d9fe8153 fix a bug introduced in previous commit 2015-01-21 13:31:17 +01:00
El Potaeto
e475b7d84e Avoid some Cran check error messages 2015-01-21 13:26:34 +01:00
hetong
34e2fbd2c4 fix some issues from the cran check 2015-01-20 21:29:51 -08:00
tqchen
417ac4a631 rm socket from source 2015-01-20 17:15:54 -08:00
hetong007
42110f3d70 documentation update 2015-01-20 16:24:01 -08:00
hetong007
d87cb24793 documentation update 2015-01-20 16:21:13 -08:00
hetong007
6901e90730 resolving not-CRAN issues 2015-01-20 15:51:42 -08:00
hetong007
eb01acfad8 improve demo of cv in R 2015-01-20 14:35:44 -08:00
hetong007
947f0a926d enable returning prediction in cv 2015-01-20 14:12:45 -08:00
tqchen
6937384e62 Squashed 'subtree/rabit/' changes from 85b7463..4ebe657
4ebe657 fix in cxx11

git-subtree-dir: subtree/rabit
git-subtree-split: 4ebe657dd7
2015-01-19 21:37:23 -08:00
tqchen
89d5e67b78 Merge commit '6937384e625dd44b181d0216fde6234be1b7c874' 2015-01-19 21:37:23 -08:00
tqchen
cd2bce4719 update with new rabit api 2015-01-19 21:32:25 -08:00
tqchen
ea50f8e030 Squashed 'subtree/rabit/' changes from 1db6449..85b7463
85b7463 change def of reducer to take function ptr
fe6366e add engine base
a98720e more deps

git-subtree-dir: subtree/rabit
git-subtree-split: 85b746394e
2015-01-19 21:26:25 -08:00
tqchen
25cf27d50f Merge commit 'ea50f8e030111f659dd69b89c86eba51abd39eba' 2015-01-19 21:26:25 -08:00
hetong
3b190123c8 update demo readme 2015-01-19 19:29:24 -08:00
hetong
c0c6951b73 fix bug in format of input 2015-01-19 19:26:25 -08:00
hetong
f295177b1d add nrow to getinfo 2015-01-19 13:36:53 -08:00
hetong
a1e188aa75 add nrow to getinfo 2015-01-19 13:35:11 -08:00
hetong
43c13d82ba add leaf example in R 2015-01-19 10:34:14 -08:00
tqchen
312546b99d quick fix 2015-01-19 10:00:28 -08:00
tqchen
7c6cf4bad8 quick fix 2015-01-19 09:59:33 -08:00
Tianqi Chen
1ea23d3390 Merge pull request #149 from tqchen/unity
add proptype of predleaf in R, fix bug in lambda rank
2015-01-19 09:08:19 -08:00
tqchen
632fdbbf5c add proptype of predleaf in R, fix bug in lambda rank 2015-01-19 09:07:37 -08:00
Tianqi Chen
9b3a601ede Merge pull request #148 from tqchen/unity
Distributed XGBoost from unity
2015-01-19 08:45:07 -08:00
tqchen
b9650f19c1 change tracker dir 2015-01-19 08:41:14 -08:00
tqchen
c1f84ba446 add note to subtree 2015-01-19 08:39:26 -08:00
tqchen
902f84cf4a ok 2015-01-19 08:37:17 -08:00
tqchen
9ea6b2f1b8 minor fix 2015-01-19 08:36:19 -08:00
tqchen
f0a412d224 update note 2015-01-19 08:34:35 -08:00
Tianqi Chen
e5c609271f add rabit to xgb 2015-01-19 08:16:54 -08:00
tqchen
ccba73e5d5 remove xgpred 2015-01-19 08:07:50 -08:00
tqchen
1211ea40c9 add single instance prediction 2015-01-19 08:07:22 -08:00
Tianqi Chen
748389f052 fix win compile 2015-01-19 00:29:03 -08:00
Tianqi Chen
8e8926550f fix of Rpack 2015-01-19 00:01:17 -08:00
tqchen
0b55fa6aff Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity 2015-01-18 22:56:33 -08:00
tqchen
631b092b25 changes 2015-01-18 22:56:29 -08:00
Tianqi Chen
f22ee7cb61 windows changes 2015-01-18 22:54:01 -08:00
tqchen
7780bc45c2 change R build script 2015-01-18 22:14:38 -08:00
tqchen
81749e6b63 Squashed 'subtree/rabit/' changes from c7282ac..1db6449
1db6449 remove include in -I, make things easier to direct compile

git-subtree-dir: subtree/rabit
git-subtree-split: 1db6449b01
2015-01-18 21:31:16 -08:00
tqchen
c51e01da2f Merge commit '81749e6b637997156c481e7f1d74fd319ba7b1d4' into unity 2015-01-18 21:31:16 -08:00
tqchen
ba0b950a84 add sync module 2015-01-18 21:31:09 -08:00
tqchen
d87691ec60 Squashed 'subtree/rabit/' content from commit c7282ac
git-subtree-dir: subtree/rabit
git-subtree-split: c7282acb2a
2015-01-18 21:08:17 -08:00
tqchen
152e08974d Merge commit 'd87691ec603db325d5b1c5db1186295a748df7cc' as 'subtree/rabit' 2015-01-18 21:08:17 -08:00
tqchen
07da390575 add subtree folder 2015-01-18 21:07:31 -08:00
tqchen
9695c51ce1 Merge branch 'master' into unity 2015-01-18 20:09:36 -08:00
tqchen
f49fd88de8 Merge branch 'unity'
Conflicts:
	.gitignore
	R-package/src/xgboost_R.cpp
	src/gbm/gblinear-inl.hpp
	tools/xgcombine_buffer.cpp
2015-01-18 20:09:21 -08:00
Tianqi Chen
d50079f993 Merge pull request #145 from pommedeterresautee/master
refactoring
2015-01-18 14:57:44 -08:00
El Potaeto
d84d27ae3d refactoring 2015-01-18 00:35:38 +01:00
tqchen
b898672753 ok 2015-01-15 22:03:32 -08:00
tqchen
90ec783e65 remove build 2015-01-15 22:01:55 -08:00
tqchen
4715672d76 chg 2015-01-15 22:01:29 -08:00
tqchen
b1df8039a0 ignore 2015-01-15 21:56:39 -08:00
tqchen
b1f89f29b8 cleanup multi-node 2015-01-15 21:55:56 -08:00
tqchen
b762231b02 change makefile to lazy checkpt, fix col splt code 2015-01-15 21:32:31 -08:00
Tianqi Chen
962c2432a0 Merge pull request #143 from cblsjtu/unity
modify doc
2015-01-14 10:07:33 -08:00
Boliang Chen
4d30fa2449 Merge branch 'unity' of github.com:tqchen/xgboost into unity
Conflicts:
	multi-node/hadoop/README.md
2015-01-14 22:36:39 +08:00
Boliang Chen
ede1222b02 modify doc 2015-01-14 22:15:31 +08:00
Tong He
bbbc6be58e Add vcd to the dependencies 2015-01-13 15:38:50 -08:00
tqchen
a53f0cd9bf doc chg 2015-01-12 11:55:42 -08:00
tqchen
9346c328cb chg 2015-01-12 11:53:40 -08:00
tqchen
2a9a864b11 ok 2015-01-12 11:50:18 -08:00
tqchen
6b7f20c002 chgs 2015-01-12 11:49:42 -08:00
tqchen
5e0e8a5ff7 changes 2015-01-12 11:47:46 -08:00
tqchen
083c032319 Merge branch 'cblsjtu-unity' into unity 2015-01-12 11:41:59 -08:00
tqchen
48a44b24f9 Merge branch 'unity' of https://github.com/cblsjtu/xgboost into cblsjtu-unity
Conflicts:
	multi-node/hadoop/README.md
	multi-node/hadoop/mushroom.hadoop.conf
	multi-node/hadoop/run_hadoop_mushroom.sh
2015-01-12 11:41:07 -08:00
Tianqi Chen
d57cb4f17b Update mushroom.hadoop.conf 2015-01-12 09:02:53 -08:00
tqchen
62a108a7c2 chg of hadoop script 2015-01-11 21:02:38 -08:00
Tianqi Chen
166e7525da Merge pull request #142 from pommedeterresautee/master
avoid warning message when a tree is just made of one leaf
2015-01-11 16:02:56 -08:00
El Potaeto
48c1911bc4 fix error 2015-01-11 23:39:24 +01:00
El Potaeto
d441a9d382 avoid error when a tree is just made of one leaf 2015-01-11 23:37:02 +01:00
Tianqi Chen
9a2ad91b48 Merge pull request #138 from pommedeterresautee/master
new parameters, refactoring...
2015-01-11 14:27:38 -08:00
Tianqi Chen
15bf8677da Merge pull request #140 from EricChenDM/unity
yarn script
2015-01-11 10:40:04 -08:00
chenshuaihua
0111a14aef yarn script 2015-01-11 23:57:52 +08:00
Boliang Chen
df3f87c182 add more details 2015-01-11 18:20:16 +08:00
Boliang Chen
fdbca6013d modify 2015-01-11 17:57:41 +08:00
El Potaeto
31a3b38ef8 add new parameters model to avoid the use of dump file for functions plot, dt.tree, importance
add new size parameter for plot function
2015-01-11 09:40:55 +01:00
Boliang Chen
ef2518364c change to minimal setting 2015-01-11 16:07:00 +08:00
Boliang Chen
525c1594e5 revise the script 2015-01-11 16:06:19 +08:00
Tianqi Chen
c38f7109bd Merge pull request #137 from cblsjtu/unity
Unity hadoop version scripts
2015-01-10 23:47:52 -08:00
tqchen
69e079941e allow pred to stdout 2015-01-10 23:46:29 -08:00
Boliang Chen
ceabf5755f hadoop version conf 2015-01-11 15:44:16 +08:00
Boliang Chen
fb65356dd4 change file name 2015-01-11 15:41:46 +08:00
Boliang Chen
2f95968a1c ok 2015-01-11 15:34:55 +08:00
Boliang Chen
966416e69c Merge remote-tracking branch 'tqchen/unity' into unity 2015-01-11 13:48:29 +08:00
tqchen
db4637b085 Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity 2015-01-10 21:33:16 -08:00
tqchen
9eaf073e3c change default distributed mode to row 2015-01-10 21:33:07 -08:00
Boliang Chen
d5e9b1d4ea delete hadoop conf 2015-01-11 13:08:52 +08:00
El Potaeto
c8c5789efd add new parameters to several functions avoid the need of a text dump 2015-01-11 03:06:41 +01:00
El Potaeto
70df227689 dump function is now memory safe 2015-01-11 01:04:54 +01:00
Tianqi Chen
d348f83c17 Merge pull request #136 from cblsjtu/unity
hadoop example
2015-01-10 09:31:06 -08:00
Boliang Chen
7665dd1ed2 rename 2015-01-11 00:04:47 +08:00
Boliang Chen
74348c8001 initialize 2015-01-11 00:00:03 +08:00
Boliang Chen
24f99220cb fix bugs 2015-01-10 23:59:25 +08:00
Boliang Chen
61a43111a7 hadoop version of xgboost binary classification script 2015-01-10 12:30:00 +08:00
Boliang Chen
e20d4f4387 comment some parameters not supported by hadoop version of xgboost 2015-01-10 12:26:43 +08:00
Tianqi Chen
72f6fbd46f Merge pull request #135 from pommedeterresautee/master
fix a small bug in CV function
2015-01-09 10:06:22 -08:00
El Potaeto
359889e3d6 fix a small bug in CV function 2015-01-09 19:03:47 +01:00
Tianqi Chen
75a75bc1e9 Merge pull request #134 from pommedeterresautee/master
nice work! merged to master.
2015-01-09 09:46:53 -08:00
El Potaeto
99b4ead937 add new dependency on DiagrammeR 2015-01-09 18:28:10 +01:00
El Potaeto
a3493934d1 documentation example change 2015-01-09 18:26:56 +01:00
El Potaeto
51935851bd fix plenty of small bugs 2015-01-09 18:24:12 +01:00
El Potaeto
b656ca1554 reindent 2015-01-09 11:54:23 +01:00
El Potaeto
d96bd15b7d small fix in the C dump code 2015-01-09 11:52:40 +01:00
El Potaeto
31d0e8f65d better doc of dump function 2015-01-09 11:14:08 +01:00
El Potaeto
9d6eecf34e small change in import lib 2015-01-09 11:07:53 +01:00
El Potaeto
10f755e055 only replace tabulation which begins a line (avoid wrong replacement in feature name) 2015-01-09 11:06:56 +01:00
El Potaeto
3e1eea0eea refactor dump function to adapt to the new possibilities of exporting a String 2015-01-09 00:14:01 +01:00
El Potaeto
6fd8bbe71a C part export a model dump string 2015-01-08 23:47:00 +01:00
El Potaeto
3d0bbae2c2 refactoring of importance function 2015-01-07 18:18:52 +01:00
El Potaeto
d532f04394 add new function to read model and use it in the plot function 2015-01-07 17:47:50 +01:00
El Potaeto
e380e4facf refactoring for perf 2015-01-07 17:09:56 +01:00
El Potaeto
cce26756bf add style option 2015-01-07 17:05:34 +01:00
pommedeterresautee
9e20893d35 Change in aesthetic
Improve documentation
2015-01-06 23:57:33 +01:00
El Potaeto
3dd202a19e Add stat indicators in plot 2015-01-06 18:18:55 +01:00
El Potaeto
94d070da60 add limit number of trees option 2015-01-06 13:59:29 +01:00
El Potaeto
a6c588f90d fix arg check 2015-01-06 13:59:14 +01:00
Boliang Chen
f82732a362 add hadoop folder 2015-01-06 17:09:15 +08:00
El Potaeto
c64bfad5bb fix import issue 2015-01-05 19:35:33 +01:00
El Potaeto
59412f64ad Merge remote-tracking branch 'upstream/master' 2015-01-05 19:30:29 +01:00
El Potaeto
f793df671b Change code to look like a function 2015-01-05 19:26:26 +01:00
El Potaeto
3d068b4e1a new documentation
new import
2015-01-05 19:26:09 +01:00
El Potaeto
b9799c6ac4 refactor plot function 2015-01-04 22:42:17 +01:00
El Potaeto
ffbd78fce4 use style CSS class instead of q style per item 2015-01-04 22:40:31 +01:00
El Potaeto
f6290ad792 plot all trees 2015-01-04 21:56:41 +01:00
El Potaeto
33bb168574 basis to plot 2015-01-04 17:23:53 +01:00
tqchen
2925236fab change dump stats 2015-01-04 02:35:24 -08:00
El Potaeto
8b45ef07ca build data.table from raw model data 2015-01-04 11:21:39 +01:00
El Potaeto
cfe5015e54 small fix in parsing 2015-01-04 11:21:03 +01:00
El Potaeto
cdea1685e5 Add a new verbose parameter to print progress during the process (set to true by default to not change behavior of existing code) + source code refactoring 2015-01-02 11:21:53 +01:00
Tianqi Chen
61df646eed Merge pull request #132 from pommedeterresautee/master
Return history as data.table for cross validation + bring back linear model dump to master + other fixes
2015-01-02 17:06:24 +08:00
El Potaeto
4d0d65837d parse history first line to guess which columns are required 2015-01-01 22:43:23 +01:00
El Potaeto
8bbe45eed2 fix some missing imports 2015-01-01 16:09:03 +01:00
El Potaeto
a524a51a06 return history as data.table for cross validation + documentation 2015-01-01 16:05:43 +01:00
El Potaeto
34aaeff3d9 small documentation change 2015-01-01 14:57:48 +01:00
El Potaeto
5e5500d6d3 rewording 2015-01-01 13:50:28 +01:00
El Potaeto
901904b535 linear text dump model 2015-01-01 13:50:05 +01:00
Tianqi Chen
3974231440 Merge pull request #130 from pommedeterresautee/master
Improve demo text (more explanation)
2014-12-31 18:32:13 +08:00
El Potaeto
d07be2bb96 Username parameter is deprecated in install_function (see doc of the package for more information). 2014-12-31 11:03:51 +01:00
El Potaeto
4f0ae53974 text change 2014-12-31 10:49:05 +01:00
El Potaeto
9998575c32 Small text improvement 2014-12-31 10:47:57 +01:00
El Potaeto
4cc3790b76 Improve explanation, add new concepts. 2014-12-31 10:36:10 +01:00
Tianqi Chen
4183c239ca Merge pull request #128 from mhue/master
Fixed minor typos.
2014-12-31 09:04:30 +08:00
El Potaeto
c3d8f21df3 change assignation sign 2014-12-31 00:52:53 +01:00
Bing Xu
9267e3b368 Merge pull request #129 from pommedeterresautee/master
Add demo code
2014-12-30 16:51:11 -07:00
El Potaeto
006578e2e6 fix demo index 2014-12-31 00:46:12 +01:00
El Potaeto
97fd9b47d4 Add new demo 2014-12-31 00:39:13 +01:00
Martial Hue
79731f48b6 Fixed minor typos. 2014-12-30 17:50:24 +01:00
El Potaeto
7558a94507 Update wlkthrough R demo code to include variable importance. 2014-12-30 16:38:56 +01:00
El Potaeto
8e74bcdd05 remove unneeded text... 2014-12-30 16:29:13 +01:00
El Potaeto
2364e914bd Documentation regenerated with fixes 2014-12-30 16:24:16 +01:00
El Potaeto
e64cb99f89 Missing parameter documentation
Fix data documentation
2014-12-30 16:22:50 +01:00
El Potaeto
af31397ec2 Missing parameter documentation 2014-12-30 16:22:24 +01:00
El Potaeto
31ed2813bd Spell 2014-12-30 16:05:12 +01:00
El Potaeto
45a006f367 R demo code README 2014-12-30 16:04:43 +01:00
El Potaeto
345b93fcfa fix link 2014-12-30 15:03:21 +01:00
El Potaeto
d8eb978f98 Update readme with new win on Kaggle 2014-12-30 15:00:52 +01:00
cblsjtu
01f640f8a6 Merge pull request #1 from tqchen/unity
Unity
2014-12-30 20:26:12 +08:00
Tianqi Chen
39bb719063 Merge pull request #125 from pommedeterresautee/master
Take gain into account for feature importance
2014-12-30 19:50:19 +08:00
El Potaeto
c6f76fab56 add new Gain and Weight columns.
documentation updated.
2014-12-30 12:32:58 +01:00
El Potaeto
c754fd4ad0 documentation wording 2014-12-30 12:32:21 +01:00
El Potaeto
3694772bde Add a new Weight and Gain column.
Update documentation.
2014-12-30 12:16:13 +01:00
tqchen
5ad100b5a3 now support distributed evaluation 2014-12-29 19:24:08 -08:00
tqchen
c395c5bed3 update build script 2014-12-29 17:41:47 -08:00
El Potaeto
78813d8f78 wording 2014-12-30 00:12:01 +01:00
El Potaeto
263f7fa69d Take gain into account to discover most important variables 2014-12-29 23:57:41 +01:00
El Potaeto
dba1ce7050 new dependency over stringr 2014-12-29 23:57:01 +01:00
El Potaeto
9b6a14a99d regeneration of documentation 2014-12-29 23:56:31 +01:00
El Potaeto
755be4b846 Add variable type checks 2014-12-29 10:31:17 +01:00
tqchen
6b96737811 add dump statistics 2014-12-28 17:45:37 -08:00
Tianqi Chen
0c7e090c19 Merge pull request #124 from pommedeterresautee/master
Add a new function to see importance of features in a model
2014-12-28 20:06:55 +08:00
El Potaeto
99af2c8ffd Documentation of the function 2014-12-28 11:33:14 +01:00
El Potaeto
84fb89af70 fix small bug introduced in refactoring 2014-12-28 11:30:55 +01:00
El Potaeto
2154a160a3 refactoring of validation to improve source code readability. 2014-12-28 11:18:26 +01:00
El Potaeto
151285300b change version number + date 2014-12-28 11:02:48 +01:00
El Potaeto
46862e561b Add .gitignore 2014-12-28 10:47:02 +01:00
El Potaeto
ce83611a72 generated documentation with ROxygen2 2014-12-28 10:46:31 +01:00
El Potaeto
e63c79d6c6 new function cv.importance + documentation 2014-12-28 10:45:47 +01:00
El Potaeto
8c17a86b38 Update Namespace with new function 2014-12-28 10:24:43 +01:00
El Potaeto
1d64cd8896 Add new dependency 2014-12-28 10:24:08 +01:00
El Potaeto
4369a57270 fix data labels 2014-12-28 09:56:55 +01:00
tqchen
c8f422b3b9 add dump to linear model 2014-12-24 02:56:32 -08:00
tqchen
6d7ef172ef add base64 model format 2014-12-24 02:33:50 -08:00
tqchen
c8396ca24e add mock exec 2014-12-21 18:47:56 -08:00
tqchen
677475529f fix the row split recovery, add per iteration random number seed 2014-12-21 17:31:42 -08:00
tqchen
eff5c6baa8 push in row mock file 2014-12-21 04:36:18 -08:00
tqchen
d603852828 rm boost str 2014-12-21 00:17:27 -08:00
tqchen
31eedfea59 pas mock, need to fix rabit lib for not initialization 2014-12-21 00:14:00 -08:00
tqchen
b078663982 ok 2014-12-20 16:39:39 -08:00
tqchen
7a35e1a906 change hist update to lazy 2014-12-20 05:02:38 -08:00
tqchen
deb21351b9 add rabit checkpoint to xgb 2014-12-20 01:05:40 -08:00
tqchen
8e16cc4617 change allreduce lib to rabit library, xgboost now run with rabit 2014-12-20 00:17:09 -08:00
Tianqi Chen
646f33d01d Update README.md 2014-12-12 05:47:00 -08:00
Tianqi Chen
a50fd27fd3 Update README.md 2014-12-12 05:46:32 -08:00
Tianqi Chen
5ae99372d6 Update simple_dmatrix-inl.hpp 2014-11-26 09:13:49 -08:00
Tianqi Chen
be5fb800d5 Merge pull request #112 from tfgit/master
Fixed README
2014-11-25 19:29:41 -08:00
Ted Fujimoto
baf41d589d Fixed README 2014-11-25 22:17:36 -05:00
Tianqi Chen
8d7dbc65b3 Merge pull request #111 from tfgit/master
OS X OpenMP support instructions
2014-11-25 19:12:42 -08:00
Ted Fujimoto
198489438f Added OS X OpenMP instructions 2014-11-25 21:42:13 -05:00
Ted Fujimoto
c356a0acc2 Remove tools folder 2014-11-25 21:27:50 -05:00
Tianqi Chen
cdcfa5687a Update socket.h 2014-11-23 22:46:57 -08:00
tqchen
f53be2884a ok 2014-11-23 22:42:44 -08:00
Tianqi Chen
f805ecb5f3 fix a bug in node sindex set 2014-11-23 22:35:30 -08:00
tqchen
3e162ceda6 windows strange 2014-11-23 22:21:15 -08:00
tqchen
35bf2101fe seems a prob in win 2014-11-23 22:18:28 -08:00
Tianqi Chen
fde580b08e fix windows run 2014-11-23 22:12:55 -08:00
tqchen
77ffd0465b ok 2014-11-23 21:36:22 -08:00
tqchen
78ca72b9c7 start work on win 2014-11-23 21:34:15 -08:00
tqchen
d2f151ef5a bring it back alive again 2014-11-23 21:27:16 -08:00
Tianqi Chen
7f3dc967cf changes in socket, a bit work in linux side first 2014-11-23 21:21:52 -08:00
tqchen
db2adb6885 start check windows compatiblity 2014-11-23 20:59:10 -08:00
Tianqi Chen
2e444f8338 remove warning from MSVC need another round of check 2014-11-23 20:52:13 -08:00
tqchen
b55fe80350 add row map example 2014-11-23 18:15:42 -08:00
tqchen
372de9f968 check in conf 2014-11-23 17:35:21 -08:00
tqchen
373620503a ok 2014-11-23 14:08:34 -08:00
tqchen
5f08313cb2 make wrapper ok 2014-11-23 14:03:59 -08:00
tqchen
69b2f31098 bugfix in allreduce 2014-11-23 11:31:34 -08:00
tqchen
115424826b basic test pass 2014-11-23 11:15:48 -08:00
tqchen
c499dd0f0c start testing allreduce 2014-11-22 22:55:43 -08:00
tqchen
cb1c34aef0 add nonblocking mode 2014-11-22 17:15:05 -08:00
tqchen
67c5d8a2e6 allreduce server side ok, need to add master 2014-11-22 17:12:19 -08:00
tqchen
4864220702 have the function, ready, need initializer 2014-11-22 12:15:30 -08:00
tqchen
7ec3fc936a check in allreduce tcp, check if there could be more concise form 2014-11-21 22:54:11 -08:00
tqchen
b6e1b19205 checkin socket module 2014-11-21 16:09:28 -08:00
tqchen
84dcab6795 checkin socket module 2014-11-21 16:09:26 -08:00
Tianqi Chen
c29a600d46 Update README.md 2014-11-21 09:48:59 -08:00
tqchen
168bb0d0c9 add predict leaf indices 2014-11-21 09:32:09 -08:00
Tianqi Chen
6ed82edad7 Merge pull request #106 from tqchen/master
pull master into unity
2014-11-21 08:56:01 -08:00
Tianqi Chen
d4103ea7ea Update README.md 2014-11-20 22:01:26 -08:00
Tong He
c16e0f6809 Update predict.xgb.Booster.R
add parameter missing
2014-11-20 15:19:53 -08:00
Tong He
98ee7e8057 Update xgboost.R
add parameter missing
2014-11-20 15:14:05 -08:00
Tong He
20817b56f3 Update xgb.cv.R
add parameter missing
2014-11-20 15:14:00 -08:00
Tong He
bbd7098e51 Update utils.R
add parameter missing
2014-11-20 15:13:28 -08:00
tqchen
ed87eb61bd allow nan as mssing 2014-11-20 13:14:04 -08:00
tqchen
23fbf079b9 fix bug in row 2014-11-20 12:56:30 -08:00
tqchen
974202eb55 check pipe, commit optimization for hist 2014-11-20 11:22:09 -08:00
tqchen
6b674b491f Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity 2014-11-19 20:09:38 -08:00
tqchen
9af464303a checkin row continue training 2014-11-19 20:09:26 -08:00
Tianqi Chen
b595854e8c Update README.md 2014-11-19 20:08:11 -08:00
tqchen
970dd58dc2 checkin continue training 2014-11-19 20:06:08 -08:00
tqchen
26e5eae6f2 ok 2014-11-19 19:27:04 -08:00
tqchen
41eac089c8 chg 2014-11-19 19:25:49 -08:00
tqchen
338117867b small change 2014-11-19 19:24:20 -08:00
tqchen
a0342cb196 small change 2014-11-19 19:22:36 -08:00
tqchen
3b48a9f359 checkin split row 2014-11-19 19:21:56 -08:00
tqchen
c42ba8d281 get multinode in 2014-11-19 19:19:53 -08:00
tqchen
7c3a392136 compile 2014-11-19 15:28:09 -08:00
tqchen
55e62a7120 still need to test row merge 2014-11-19 11:44:24 -08:00
tqchen
da54f5e5d8 add note for col 2014-11-19 11:37:54 -08:00
tqchen
03e24cf590 check multinode 2014-11-19 11:22:17 -08:00
tqchen
54e2ed90d7 recheck column mode 2014-11-19 11:21:07 -08:00
tqchen
dffcbc838b Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity
Conflicts:
	src/tree/updater_histmaker-inl.hpp
2014-11-19 09:55:05 -08:00
tqchen
fa1581b94c cqmaker ok 2014-11-19 09:51:30 -08:00
tqchen
32beb56ba3 only need to add in create hist col base 2014-11-18 22:21:41 -08:00
tqchen
08e9813c9b potential BUG in skmaker? 2014-11-18 21:23:36 -08:00
tqchen
1b66a87456 checkin skmaker 2014-11-18 20:57:28 -08:00
tqchen
303f8b9bc5 hack to make the propose fast in one pass, start sketchmaker 2014-11-18 11:25:54 -08:00
tqchen
ce7ecadf5e simplify 2014-11-18 10:52:18 -08:00
tqchen
5de0a2cdc0 sorted base sketch maker 2014-11-18 10:19:18 -08:00
tqchen
5e8e9a9b74 updated base 2014-11-17 10:49:53 -08:00
tqchen
8874234e5e check in basemaker 2014-11-16 22:23:33 -08:00
tqchen
d11445e0b1 add in sync 2014-11-16 22:01:22 -08:00
tqchen
8ed585a7a2 check in two bad ones, start think of column distribut cut row 2014-11-16 13:31:50 -08:00
tqchen
5061d55725 alrite 2014-11-16 11:47:21 -08:00
tqchen
129fee64f3 fix regression 2014-11-16 11:38:21 -08:00
tqchen
02c2278f96 ok 2014-11-15 21:18:15 -08:00
tqchen
daa28f238e fix compile, need final leaf node? 2014-11-15 21:02:19 -08:00
tqchen
c86b83ea04 a version that compile 2014-11-15 17:41:03 -08:00
tqchen
c1f1bb9206 first ver 2014-11-15 09:46:30 -08:00
tqchen
076159cf7a remove cstdio 2014-11-14 14:37:13 -08:00
Tianqi Chen
b66bcb7974 Merge pull request #100 from travisbrady/master
add ifdef __cplusplus to wrapper header file
2014-11-14 14:33:49 -08:00
Travis Brady
42712988af add ifdef __cplusplus to wrapper header file 2014-11-14 15:48:13 -06:00
tqchen
698c010247 add example 2014-11-10 22:09:01 -08:00
tqchen
e7ea87b5fd ok for now 2014-11-10 22:03:42 -08:00
tqchen
9d101b47f9 optimize heavy hitter 2014-11-10 21:18:37 -08:00
tqchen
b426eef527 chg begin end type 2014-11-10 17:24:44 -08:00
tqchen
9855a90142 unified gk and wq 2014-11-10 17:06:10 -08:00
tqchen
7b8ba268dc commit in quantile test 2014-11-10 16:44:07 -08:00
tqchen
d4c4ee0b01 mostly correct\n 2014-11-09 23:34:45 -08:00
tqchen
69874dc571 init check 2014-11-09 21:56:56 -08:00
tqchen
5561dd9cb0 fix bug in queue2summary 2014-11-09 21:09:07 -08:00
tqchen
7c1ec78a01 before test quantile 2014-11-09 18:03:36 -08:00
tqchen
0e6b899d07 quantile 2014-11-09 16:02:38 -08:00
tqchen
aace84c349 pass group data test 2014-11-06 15:58:36 -08:00
tqchen
539fce2856 ok 2014-11-06 15:37:23 -08:00
tqchen
ca96468745 everything is ready, except for propose 2014-11-02 21:52:59 -08:00
Tianqi Chen
b2850ae0f9 Update README.md 2014-10-23 09:43:03 -07:00
Tianqi Chen
c17c0f3197 Update README.md 2014-10-23 09:41:12 -07:00
tqchen
96c5196647 remv debug 2014-10-20 18:06:15 -07:00
tqchen
23eaa7ed32 add quantile sketch 2014-10-20 18:04:39 -07:00
tqchen
dcd0dd5e26 finish find split, next to do quantile sketch 2014-10-18 10:24:29 -07:00
tqchen
a7bc769971 incomplete histmaker 2014-10-17 17:55:07 -07:00
tqchen
c2fa390181 move sync tree to pruner, pruner is now distributed 2014-10-17 14:53:43 -07:00
tqchen
a68ac8033e refresher is now distributed 2014-10-17 14:48:32 -07:00
tqchen
9df9e07f9b minor change in main 2014-10-17 14:11:46 -07:00
tqchen
f6d61f02f6 fix load bug 2014-10-16 21:47:01 -07:00
tqchen
3f3c90c3c0 add part_load col 2014-10-16 19:41:43 -07:00
tqchen
f512f08437 finish mushroom example 2014-10-16 18:06:47 -07:00
tqchen
0cf2dd39ea new change for mpi 2014-10-16 15:12:10 -07:00
tqchen
a21df0770d make clear seperation 2014-10-16 13:03:42 -07:00
tqchen
47145a7fac ok, now work on update position 2014-10-16 11:56:55 -07:00
tqchen
aefe58a207 middle version 2014-10-16 10:38:49 -07:00
tqchen
6680bffaae chg 2014-10-15 21:45:13 -07:00
tqchen
f2577fec86 intial version of sync wrapper 2014-10-15 21:39:42 -07:00
tqchen
e295128973 add bitmap . 2014-10-15 14:30:09 -07:00
tqchen
d0daecb4d3 add bitmap . 2014-10-15 14:30:06 -07:00
Tianqi Chen
f2cceb37eb Update README.md 2014-10-13 09:21:43 -07:00
tqchen
c957e1a648 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-10-01 09:20:16 -07:00
tqchen
78efa13d41 add example with additional attr 2014-10-01 09:20:06 -07:00
Tianqi Chen
d6b60a1e4a Update README.md 2014-09-18 17:53:20 -07:00
Tianqi Chen
d3f7952991 Update README.md 2014-09-18 17:52:41 -07:00
tqchen
91e34c6fb4 ok 2014-09-12 21:26:38 -07:00
tqchen
bf2426f3cd some changes 2014-09-12 17:31:06 -07:00
tqchen
3a0be47b1c add tmp file 2014-09-12 15:52:39 -07:00
tqchen
87cc53f0cd make basic combine buf 2014-09-10 21:38:50 -07:00
tqchen
fe9e89cadd Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity 2014-09-10 21:33:51 -07:00
tqchen
0e8846a42f ok 2014-09-10 13:51:34 -07:00
Tianqi Chen
496301585a Update README.md 2014-09-09 21:43:45 -07:00
Tianqi Chen
4275403004 Update README.md 2014-09-09 21:38:01 -07:00
Tianqi Chen
c380342c5f Update README.md 2014-09-09 21:35:24 -07:00
Tianqi Chen
2fec85ab8a Update README.md 2014-09-09 21:34:10 -07:00
Tianqi Chen
86bdef1f19 Update README.md 2014-09-09 21:31:40 -07:00
Tianqi Chen
9e701440e7 Update README.md 2014-09-09 21:28:58 -07:00
Tianqi Chen
1a6af1aacf Update README.md 2014-09-09 21:28:19 -07:00
Tianqi Chen
011df2993a Update README.md 2014-09-09 21:27:01 -07:00
tqchen
7d0d3f07ef Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity 2014-09-08 21:52:34 -07:00
tqchen
a3806398b9 delete old cvpack 2014-09-08 21:34:42 -07:00
tqchen
a3d5930f26 Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity 2014-09-08 16:20:48 -07:00
tqchen
e90b25a381 add object bound checking 2014-09-08 16:20:41 -07:00
Tianqi Chen
4e44dd83a7 Merge pull request #72 from giuliohome/master
python 3 encoding
2014-09-08 14:49:53 -07:00
giuliohome
02e41be857 python 3 encoding 2014-09-08 23:40:04 +02:00
tqchen
d4ab359be1 fix 2014-09-07 20:01:03 -07:00
tqchen
19a1ee24a5 try predpath 2014-09-07 18:40:15 -07:00
tqchen
75aa5bd258 Merge branch 'master' into unity 2014-09-07 18:16:55 -07:00
tqchen
ae3621b372 Merge branch 'unity'
Conflicts:
	R-package/src/xgboost_R.cpp
	wrapper/xgboost.py
2014-09-07 18:16:49 -07:00
Tianqi Chen
852ce6be0b Update README.md 2014-09-07 16:48:45 -07:00
Tong He
946f3c7ac5 Update DESCRIPTION 2014-09-07 10:36:50 -07:00
tqchen
5621d9811f remove deprecate 2014-09-07 10:17:34 -07:00
hetong
9e3b878943 refine style with max.depth 2014-09-06 23:20:11 -07:00
hetong
1925321a16 remove incorrect link to old folders 2014-09-06 23:14:38 -07:00
hetong
80636cd804 improve runall.R 2014-09-06 23:06:47 -07:00
hetong
cd35d88a03 remove inst/, improve vignette 2014-09-06 23:05:21 -07:00
hetong
50d77c72eb Merge branch 'master' of https://github.com/tqchen/xgboost 2014-09-06 22:48:24 -07:00
hetong
fbecd163c5 replace iris in docs 2014-09-06 22:48:08 -07:00
tqchen
89b9965cbf change max depth 2014-09-06 22:40:51 -07:00
Tianqi Chen
32a2925be8 Update build.sh 2014-09-06 22:27:25 -07:00
Tianqi Chen
2d2cee879d Update build.sh 2014-09-06 22:26:35 -07:00
tqchen
17ebdde707 chg back to g++ 2014-09-06 22:21:50 -07:00
tqchen
014e830a04 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-09-06 22:20:18 -07:00
tqchen
a7a0b34a54 add auto build script 2014-09-06 22:20:11 -07:00
hetong
ddf715953a forced add doc for test 2014-09-06 22:03:07 -07:00
hetong
d174a79fbd add doc for agaricus.test 2014-09-06 21:54:12 -07:00
hetong
43a781f59b improvement for reducing warnings 2014-09-06 21:28:42 -07:00
hetong
d214013681 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-09-06 19:02:56 -07:00
hetong
e04b6aaec5 add documentation for datasets 2014-09-06 19:02:23 -07:00
Tianqi Chen
e7bce3a940 Update xgb.DMatrix.save.R 2014-09-06 18:38:01 -07:00
Tianqi Chen
67fc1dd990 Update xgb.DMatrix.save.R 2014-09-06 18:37:34 -07:00
hetong
99b7ead5ad re-compress the data 2014-09-06 18:29:13 -07:00
hetong
a9bdf38885 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-09-06 11:23:19 -07:00
tqchen
09e39e5901 chg pack file 2014-09-06 11:21:54 -07:00
hetong
c3cef7e2c7 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-09-06 11:17:43 -07:00
hetong
f1d7b012a6 refine doc, with Rd 2014-09-06 11:17:38 -07:00
tqchen
515befd4f9 remove runall 2014-09-06 11:15:10 -07:00
tqchen
a42bcaf61f add 2014-09-06 11:14:32 -07:00
tqchen
e9ed4eb1a2 ok 2014-09-06 11:13:19 -07:00
tqchen
7879db8702 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-09-06 10:29:42 -07:00
tqchen
35431e664e add boost from prediction 2014-09-06 10:28:48 -07:00
hetong
166df74024 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-09-06 10:20:05 -07:00
hetong
a35d93c736 change data from iris back to mushroom 2014-09-06 10:19:46 -07:00
tqchen
4a8612defc add customize objective 2014-09-06 10:19:19 -07:00
tqchen
b858283ec5 add basic walkthrough 2014-09-06 10:11:45 -07:00
hetong
8ad9293437 expose setinfo 2014-09-06 00:44:24 -07:00
hetong
9e05db7261 add mushroom data 2014-09-06 00:26:02 -07:00
hetong
3014ac6778 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-09-06 00:23:02 -07:00
hetong
bb2c61f7b5 custom eval 2014-09-06 00:16:55 -07:00
tqchen
6157d538c1 check in current iris 2014-09-05 23:22:54 -07:00
hetong
4d00be84c3 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-09-05 23:04:00 -07:00
hetong
905051b7cb in the middle of guide-r 2014-09-05 23:03:04 -07:00
tqchen
ab238ff831 chg cv 2014-09-05 22:46:09 -07:00
tqchen
831a102d48 add cv 2014-09-05 22:36:59 -07:00
tqchen
0ecd6c08f3 add cross validation 2014-09-05 22:34:32 -07:00
tqchen
bc1817ca2f Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-09-05 20:34:46 -07:00
tqchen
984102e586 style cleanup, incomplete CV 2014-09-05 20:34:41 -07:00
hetong
af07f5135a cleaning 2014-09-05 20:33:39 -07:00
hetong
63dd037db6 add r basic walkthrough 2014-09-05 20:25:38 -07:00
hetong
de08c5a3da remove temp files 2014-09-05 19:49:25 -07:00
hetong
801a17fa02 fix iris to Rd files 2014-09-05 19:47:58 -07:00
hetong
d776e0fdf5 fix iris multiclass problem 2014-09-05 19:22:27 -07:00
Tianqi Chen
2b170ecda4 Merge pull request #69 from giuliohome/fix
Fixing Configuration Type for Win32/Debug.

Thanks Giulio!
2014-09-05 08:41:34 -07:00
giuliohome
59e1e75857 same version
reset changes
2014-09-05 13:37:18 +02:00
giuliohome
1d90288655 Fixing Configuration Type for Win32/Debug
Proposed fix to the main repo
Changed the windows wrapper type to DynamicLibrary. It was already ok
for the Win64/Release. maybe it got lost after latest commit
2014-09-05 13:30:02 +02:00
giuliohome
efbd1b21a6 Merge branch 'tqchen-master' 2014-09-05 13:26:20 +02:00
giuliohome
909a61edac Merge branch 'master' of https://github.com/tqchen/xgboost into tqchen-master
Conflicts:
	README.md
2014-09-05 13:24:45 +02:00
giuliohome
73b627d532 Fixing Configuration Type for Win32/Debug
Proposed fix to the main repo
Changed the windows wrapper type to DynamicLibrary. It was already ok
for the Win64/Release. maybe it got lost after latest commit
2014-09-05 13:08:06 +02:00
tqchen
e8df76b131 make it cleaner 2014-09-04 21:22:02 -07:00
tqchen
80bf8b71f2 OK 2014-09-04 21:21:26 -07:00
tqchen
a9dc145433 add what is new 2014-09-04 21:20:27 -07:00
tqchen
0752b8b9f3 change readme 2014-09-04 21:12:25 -07:00
tqchen
512a0f69fd add glm 2014-09-04 21:09:52 -07:00
tqchen
f9f982a7aa Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-09-04 20:58:05 -07:00
tqchen
a1c6e22af9 add create from csc 2014-09-04 20:57:49 -07:00
tqchen
df3eafc5ba chg mldata to page 2014-09-04 14:20:52 -07:00
antinucleon
1222839efa higgs cv 2014-09-04 11:00:42 -06:00
tqchen
2bc1d2e73a fix doc 2014-09-04 09:23:35 -07:00
tqchen
6c6d00261c small fix to the doc 2014-09-04 09:18:52 -07:00
tqchen
da9c856701 add cv for python 2014-09-03 22:43:55 -07:00
Tianqi Chen
586d6ae740 Update basic_walkthrough.py 2014-09-03 22:05:56 -07:00
Tianqi Chen
d4b62e679d Update README.md 2014-09-03 22:05:13 -07:00
Tianqi Chen
b078c159bd Update README.md 2014-09-03 21:42:28 -07:00
giuliohome
3f11354adb Parallel execution of CV plus double inputted model 2014-09-03 23:14:31 +02:00
tqchen
46cddb80f4 Merge branch 'mastet push origin unityr' into unity 2014-09-03 13:52:11 -07:00
tqchen
5f6e849b21 Merge branch 'unity'
Conflicts:
	src/utils/io.h
	wrapper/xgboost.py
2014-09-03 13:52:03 -07:00
tqchen
8952d9c357 fix 2014-09-03 13:28:03 -07:00
tqchen
b2586b6130 ok 2014-09-03 13:27:06 -07:00
tqchen
5cd92e33f6 remove R for now 2014-09-03 13:24:34 -07:00
tqchen
e6359b5484 ok 2014-09-03 13:23:36 -07:00
tqchen
60e1167b56 fix doc 2014-09-03 13:20:23 -07:00
tqchen
7a61f0dca2 ok 2014-09-03 13:18:36 -07:00
tqchen
c1e0ff0326 push python examples in 2014-09-03 13:15:17 -07:00
tqchen
41ea0bf97a Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-09-03 13:14:00 -07:00
tqchen
fa11840f4b move python example 2014-09-03 13:13:54 -07:00
Tianqi Chen
3192bf82d8 Update xgboost.py 2014-09-03 12:15:57 -07:00
antinucleon
0c36231ea3 chg 2014-09-03 12:57:05 -06:00
tqchen
998ca3bdc9 make some changes to cv 2014-09-03 11:46:33 -07:00
tqchen
244a589e5d change include order, so that Rinternal does not disturb us 2014-09-03 11:31:05 -07:00
antinucleon
2182ebcba1 Merge branch 'master' of github.com:tqchen/xgboost 2014-09-03 00:38:06 -06:00
antinucleon
02dd8d1212 chg 2014-09-03 00:37:55 -06:00
Tianqi Chen
85dbaf638b Update xgboost.Rnw 2014-09-02 23:33:04 -07:00
Tianqi Chen
642b5bda0a Update DESCRIPTION 2014-09-02 23:30:53 -07:00
Tianqi Chen
582ef2f9d5 Update DESCRIPTION 2014-09-02 23:29:48 -07:00
tqchen
06b5533209 chg fobj back to obj, to keep parameter name unchanged 2014-09-02 23:15:41 -07:00
tqchen
ac8958b284 move custom obj build in into booster 2014-09-02 23:07:50 -07:00
tqchen
10648a1ca7 remove using std from cpp 2014-09-02 22:43:19 -07:00
tqchen
1dbcebb6fe fix cxx98 2014-09-02 22:12:28 -07:00
tqchen
65340ffda6 quick lint 2014-09-02 17:51:05 -07:00
tqchen
401d648372 some lint 2014-09-02 17:49:39 -07:00
tqchen
e6e467ad60 more ignore 2014-09-02 17:40:30 -07:00
tqchen
f3360d173b pass trival test 2014-09-02 17:38:51 -07:00
tqchen
226d26d40c still buggy 2014-09-02 17:18:17 -07:00
tqchen
a89e3063e6 untested version of cpage 2014-09-02 15:34:11 -07:00
tqchen
e4817bb4c3 fix ntreelimit 2014-09-02 15:05:49 -07:00
antinucleon
5177fa02e4 adjust weight 2014-09-02 15:22:08 -06:00
tqchen
4b9aeea89c finish the fmatrix 2014-09-02 13:14:54 -07:00
tqchen
76c513b191 t push origin unityMerge branch 'master' into unity 2014-09-02 11:22:57 -07:00
tqchen
eeb04a0603 Merge remote-tracking branch 'origin/unity'
Conflicts:
	R-package/src/Makevars
	R-package/src/Makevars.win
	src/utils/io.h
	wrapper/xgboost.py
2014-09-02 11:22:47 -07:00
tqchen
c75275a861 more movement to beginptr 2014-09-02 11:14:57 -07:00
tqchen
27cabd131e add beginPtr, to make vector address taking safe 2014-09-02 11:01:38 -07:00
tqchen
70219ee1ae move nthread to local var 2014-09-02 09:06:24 -07:00
tqchen
28128a1b6e fix new warning 2014-09-02 09:02:27 -07:00
tqchen
1d5db6877d fix param.h 2014-09-02 08:55:26 -07:00
tqchen
c9f2f47acb fix som solaris 2014-09-02 00:12:15 -07:00
tqchen
bb5c151f57 move sprintf into std 2014-09-01 23:12:50 -07:00
tqchen
29a7027dba fix the zero length vector 2014-09-01 22:50:48 -07:00
tqchen
9100ffc12a chg version 2014-09-01 22:32:03 -07:00
tqchen
42fb7b4d9d some fix to make it more c++ 2014-09-01 22:06:10 -07:00
tqchen
e43bb91185 add matrix builder 2014-09-01 21:30:03 -07:00
tqchen
9d3e09ff2a make rowbatch page flexible 2014-09-01 20:44:15 -07:00
Tianqi Chen
50f1b5d903 Update README.md 2014-09-01 19:00:37 -07:00
Tianqi Chen
b60b23ed1c Update README.md 2014-09-01 18:58:56 -07:00
Tianqi Chen
48411193ae Update README.md 2014-09-01 18:58:00 -07:00
Tianqi Chen
1841d730af Update README.md 2014-09-01 18:55:20 -07:00
Tianqi Chen
85e3fbb06a Update README.md 2014-09-01 18:54:45 -07:00
Tianqi Chen
51a9a36b51 Update DESCRIPTION 2014-09-01 18:53:24 -07:00
hetong
76d5fc7e78 attemp to fix line breaking issue of doc 2014-09-01 17:43:28 -07:00
hetong
19887dcc37 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-09-01 17:24:37 -07:00
hetong
9ee9d29f13 refine readme.md 2014-09-01 17:24:13 -07:00
tqchen
0d5debcc25 fine fix 2014-09-01 17:23:44 -07:00
tqchen
0c5f2b9409 gard GNU c 2014-09-01 17:15:04 -07:00
tqchen
2f6a64e8fa Merge branch 'master' of ssh://github.com/tqchen/xgboost
Conflicts:
	src/utils/omp.h
2014-09-01 17:03:20 -07:00
tqchen
a6ce55493d make R package strict c99 2014-09-01 17:02:42 -07:00
Tong He
d391becb4e Update omp.h 2014-09-01 16:16:06 -07:00
Tong He
ada9dd94ad Update omp.h 2014-09-01 15:51:48 -07:00
hetong
b973a4dcaa improve doc in predict 2014-09-01 15:38:29 -07:00
tqchen
8863c520e7 some quick fix 2014-09-01 15:32:02 -07:00
Tong He
025ca170ec Update predict.xgb.Booster.R 2014-09-01 15:25:16 -07:00
tqchen
6ac6a3d9c9 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-09-01 15:10:29 -07:00
tqchen
4592e500cb add ntree limit 2014-09-01 15:10:19 -07:00
hetong
24e87e1cf8 fix doc with redirection to inst/examples 2014-09-01 15:07:17 -07:00
tqchen
7d1e9f06d4 add fmatrix in, todo add buffer file 2014-09-01 10:45:05 -07:00
tqchen
4c451de90b change message 2014-09-01 09:00:45 -07:00
Tianqi Chen
7393291f81 msvc 2014-09-01 08:59:02 -07:00
tqchen
427ab6434c message 2014-09-01 08:56:40 -07:00
tqchen
6641fa546d change warning to pragma message 2014-09-01 08:50:45 -07:00
tqchen
485e0f140e add 2014-08-31 22:53:35 -07:00
tqchen
8b3465cde0 cleaner makevar 2014-08-31 22:42:15 -07:00
tqchen
b2097b96c7 more clean makevar 2014-08-31 22:39:37 -07:00
tqchen
e3153b976c chgs 2014-08-31 22:25:30 -07:00
tqchen
0a7cfb32c6 add fmatrix, fight tmr 2014-08-31 21:58:01 -07:00
giuliohome
0be4f0032c new theory: predict from cv + parametric rounds 2014-09-01 01:50:07 +02:00
giuliohome
dde22976cf format README 2014-09-01 01:17:29 +02:00
giuliohome
c60649d28c README 2014-09-01 01:16:12 +02:00
giuliohome
2d1430ac01 set NFold CV from cmd args 2014-09-01 01:14:10 +02:00
giuliohome
f1d6429e96 Parametric NFold from cmd args 2014-09-01 01:10:29 +02:00
giuliohome
147b7d33fe NFold Refactoring 2014-09-01 00:50:43 +02:00
Tianqi Chen
b49927e602 Update xgboost_R.cpp 2014-08-31 14:32:45 -07:00
tqchen
79fa8b99d4 pack script with cleanup 2014-08-31 14:26:35 -07:00
tqchen
a3187e932a Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-08-31 14:15:53 -07:00
tqchen
88da7839b7 fix random 2014-08-31 14:14:39 -07:00
Tianqi Chen
d5f37d1238 add git ignore 2014-08-31 14:13:44 -07:00
tqchen
9e0cc778e8 fix win 2014-08-31 14:12:47 -07:00
tqchen
c1e9acba17 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-08-31 14:07:51 -07:00
tqchen
168f78623f allow standalone random 2014-08-31 14:07:44 -07:00
Tong He
12d503cec8 Update DESCRIPTION 2014-08-31 13:39:49 -07:00
tqchen
ba4f00d55d Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-08-31 13:13:19 -07:00
tqchen
1ed40e2b46 more strict makefile 2014-08-31 13:13:11 -07:00
Tianqi Chen
172423ca0c Update README.md 2014-08-31 12:19:44 -07:00
tqchen
37499245ea remove GNUism 2014-08-31 10:26:20 -07:00
Tianqi Chen
4d5ec01cd3 change windows 2014-08-31 09:25:25 -07:00
tqchen
e83090a579 change flagname to pass check 2014-08-31 09:17:49 -07:00
tqchen
bba13af922 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-08-31 09:13:07 -07:00
tqchen
26c61dc0a3 remove useless flag 2014-08-31 09:12:58 -07:00
Tianqi Chen
d4aacbf8cf add ignore 2014-08-31 09:08:17 -07:00
giuliohome
f42b25ec82 test my inline cv 2014-08-31 18:04:28 +02:00
giuliohome
21f16eac7b fix: cv2 2014-08-31 18:03:12 +02:00
giuliohome
f88aa8d137 fix: submission format 2014-08-31 18:00:34 +02:00
tqchen
fabe2f39e2 more clean makefile 2014-08-31 08:36:17 -07:00
giuliohome
cd0976202b 5 fold cv implementation in c# for the demo: you see inline cv ams while training (of course on a completely separate set) 2014-08-31 17:23:58 +02:00
giuliohome
442d17501f cv1 + cv2 (inline 5-fold cross validation) 2014-08-31 17:09:52 +02:00
giuliohome
23195ac95b Merge branch 'master' of https://github.com/giuliohome/xgboost 2014-08-31 16:31:11 +02:00
giuliohome
04fc25615c Update README.md 2014-08-31 16:28:49 +02:00
giuliohome
318d57f9d0 CV 5-fold implemented 2014-08-31 16:26:42 +02:00
giuliohome
71e5b4c413 Update README.md 2014-08-31 16:13:20 +02:00
giuliohome
41eef462f0 Update README.md 2014-08-31 15:49:34 +02:00
giuliohome
e4ad70e21c Update README.md 2014-08-31 15:41:34 +02:00
giuliohome
e26c072e83 Update README.md 2014-08-31 15:39:20 +02:00
giuliohome
a7b512a1c8 Update README.md 2014-08-31 15:31:16 +02:00
giuliohome
0f28ee4a8e Update README.md 2014-08-31 15:30:48 +02:00
giuliohome
a68f6680a0 Update README.md 2014-08-31 15:29:03 +02:00
giuliohome
82470ef96b Update README.md 2014-08-31 15:28:23 +02:00
hetong
b123fbbcf9 final revision before CRAN 2014-08-30 22:24:25 -07:00
unknown
22a38d8440 move demo to inst/examples 2014-08-30 21:04:47 -07:00
Tong He
b153ffe451 Update DESCRIPTION 2014-08-30 20:46:21 -07:00
Tianqi Chen
629799df0b Update DESCRIPTION 2014-08-30 20:24:23 -07:00
tqchen
f2c8093ba6 check in description 2014-08-30 20:22:36 -07:00
tqchen
104d1d61c7 add license name 2014-08-30 20:06:31 -07:00
tqchen
273816a3b4 chg data 2014-08-30 18:58:32 -07:00
tqchen
9c0389981a fix print problem, fix Tong's email format 2014-08-30 18:49:30 -07:00
Tong He
9739a1c806 Update DESCRIPTION 2014-08-30 18:17:20 -07:00
hetong
257c864274 remove pdf file 2014-08-30 16:26:26 -07:00
hetong
9b618acba2 add import methods in NAMESPACE 2014-08-30 15:42:57 -07:00
hetong
3e85419428 add back import of methdos 2014-08-30 15:34:36 -07:00
hetong
1abdcaa11d eliminate warnings and notes from R CMD check 2014-08-30 15:17:17 -07:00
hetong
a06f01e8ec improve document format 2014-08-30 15:14:36 -07:00
tqchen
e18a4fc5b6 Merge branch 'master' into unity 2014-08-30 15:01:52 -07:00
tqchen
602558c5d6 Merge branch 'unity'
Conflicts:
	R-package/src/Makevars
	R-package/src/Makevars.win
2014-08-30 15:01:36 -07:00
tqchen
2c1aabf6b0 fix indent 2014-08-30 12:47:04 -07:00
tqchen
6e054e8fa4 fix indent 2014-08-30 12:45:46 -07:00
Tianqi Chen
3f7aeb22c5 fix some windows type conversion warning 2014-08-30 12:40:51 -07:00
Tianqi Chen
99c44f2e51 fix makefile in win 2014-08-30 12:25:41 -07:00
hetong
daf430506e Merge branch 'master' of https://github.com/tqchen/xgboost 2014-08-30 12:11:40 -07:00
hetong
f9fc1aec2f modify licence and desc to standard format 2014-08-30 12:11:15 -07:00
Tianqi Chen
202a17f148 fix windows 2014-08-30 12:10:50 -07:00
hetong
4cebbdae66 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-08-30 12:10:41 -07:00
tqchen
74b27bfad2 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-08-30 12:03:41 -07:00
tqchen
51ef32d73a chg makefile 2014-08-30 12:03:32 -07:00
hetong
70cdd2787c add 00Index 2014-08-30 12:02:01 -07:00
hetong
1b7de855e9 remove logo 2014-08-30 11:53:58 -07:00
hetong
6d36e8460d change getinfo Rd 2014-08-30 11:28:10 -07:00
Tong He
efe8b38a35 fix error in demo 2014-08-30 11:24:15 -07:00
hetong
5e839f6fe7 change location and template of vignette 2014-08-30 10:55:13 -07:00
Tianqi Chen
7845ee0c85 Update CHANGES.md 2014-08-30 09:58:35 -07:00
Tianqi Chen
784ab8d02c Update README.md 2014-08-30 09:58:14 -07:00
Tianqi Chen
86e852d1da edit the doc 2014-08-30 09:31:14 -07:00
giuliohome
6d3eea5056 c# Booster class (almost ready to do cv) 2014-08-30 16:14:09 +02:00
giuliohome
77e967f0e6 Fix: Events Dictionary 2014-08-30 15:19:12 +02:00
giuliohome
473744c5ac conversion from csv to libsvm 2014-08-30 14:55:45 +02:00
giuliohome
b208338098 c# kaggle higgs demo drafted 2014-08-30 10:26:41 +02:00
hetong
84607a34a5 refine vignette 2014-08-29 22:40:07 -07:00
Tianqi Chen
366ac95ad3 windows check 2014-08-29 21:27:03 -07:00
tqchen
9830674b75 seems page is ok, try add col tmr 2014-08-29 21:04:40 -07:00
tqchen
7bc1c3ee79 various fix of page 2014-08-29 20:54:24 -07:00
tqchen
ce772c2f3e first check of page 2014-08-29 19:59:19 -07:00
tqchen
d0e27482ef fix compiler error 2014-08-29 18:44:02 -07:00
tqchen
ce2d34ecd4 check unity back 2014-08-29 18:35:26 -07:00
tqchen
551b3b70f1 check unity back 2014-08-29 18:31:24 -07:00
giuliohome
2587da5fea First example of c# wrapper done (marshalling prediction to submission file) 2014-08-30 03:05:40 +02:00
giuliohome
8b26cba148 eval training 2014-08-30 02:03:00 +02:00
giuliohome
4a67296e30 program cleanse
NEXT TO DO: try to predict after training
2014-08-30 01:43:45 +02:00
giuliohome
ba2d062f09 sharp higgs demo - training 2014-08-30 01:36:04 +02:00
giuliohome
db46e7a730 starting to develop a c# wrapper for xgboost:
c# implementation of kaggle higgs demo
2014-08-30 01:01:30 +02:00
giuliohome
6c3bc36a25 starting to develop a c# wrapper for xgboost 2014-08-30 00:36:01 +02:00
hetong
04c520ea3d refine vignette 2014-08-29 11:53:59 -07:00
hetong
8eb00e3916 refinement of document 2014-08-29 11:43:03 -07:00
hetong
cc12ee0d22 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-08-29 11:40:37 -07:00
hetong
5f510c683b add vignette 2014-08-29 11:40:15 -07:00
tqchen@graphlab.com
6db4e99b19 improve pack script 2014-08-29 09:47:50 -07:00
unknown
086433da0d add speedtest.R by -f 2014-08-28 22:40:44 -07:00
Tianqi Chen
23e80413f5 Update README.md 2014-08-28 22:34:12 -07:00
Tianqi Chen
6f6d754d4d Update README.md 2014-08-28 22:33:09 -07:00
tqchen
03127fc07e checkin makefile 2014-08-28 22:21:51 -07:00
unknown
b0130545a6 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-08-28 22:00:44 -07:00
unknown
6ed5d37771 speed test for R, and refinement of item list in doc 2014-08-28 22:00:13 -07:00
tqchen
3e92eb13d3 make it packable 2014-08-28 21:46:12 -07:00
tqchen
2e96bc51f5 do things 2014-08-28 21:23:27 -07:00
unknown
fba591fbf5 add slice document 2014-08-28 09:24:23 -07:00
unknown
26868ebada fix NAMESPACE with import classes 2014-08-28 09:22:11 -07:00
tqchen
8c50cbb6dd checkin slice 2014-08-28 09:04:30 -07:00
tqchen
776e4627de pass pedantic 2014-08-28 08:40:34 -07:00
tqchen
8100006483 fix 2014-08-28 08:34:51 -07:00
hetong
d95bc458e3 fix NAMESPACE 2014-08-28 08:16:45 -07:00
hetong
73419f6cd7 compile Rd files, i.e. R documents 2014-08-28 08:12:48 -07:00
tqchen
df6cd25fd5 OK 2014-08-28 07:43:26 -07:00
tqchen
d79161cfce chg 2014-08-28 07:38:44 -07:00
tqchen
d00302d3ac get a pass in function docstring 2014-08-28 07:35:57 -07:00
unknown
8127f31cdd add documentation notes 2014-08-28 01:44:03 -07:00
unknown
a0f22f6aaa hide xgb.Boost 2014-08-27 22:25:54 -07:00
unknown
8a4e66299a remove default value for nrounds 2014-08-27 22:12:30 -07:00
unknown
4723b8c07e Merge branch 'master' of https://github.com/tqchen/xgboost 2014-08-27 21:36:27 -07:00
unknown
6ed5e713d5 ignore csv 2014-08-27 21:35:55 -07:00
Tianqi Chen
b380e0432f Update DESCRIPTION 2014-08-27 21:35:28 -07:00
Tianqi Chen
d7735512cf Delete LICENSE 2014-08-27 21:35:00 -07:00
Tianqi Chen
077c556179 Update DESCRIPTION 2014-08-27 21:34:41 -07:00
Tianqi Chen
ca3141208f Update README.md 2014-08-27 21:32:33 -07:00
Tianqi Chen
af5abc04b3 Update README.md 2014-08-27 21:31:47 -07:00
unknown
b51b913494 modification of higgs-pred.R 2014-08-27 21:31:13 -07:00
Tianqi Chen
8be3249cb8 Update README.md 2014-08-27 21:16:54 -07:00
Tianqi Chen
582e4e3d8c Merge pull request #51 from tqchen/unity
merge unity into master, R package ready
2014-08-27 21:13:38 -07:00
tqchen
12b19c97fa change higgs script, remove R wrapper 2014-08-27 21:13:04 -07:00
tqchen
7ab45b3e64 add files back 2014-08-27 21:07:31 -07:00
Tianqi Chen
de111a1c26 make windows version in 2010 2014-08-27 21:01:39 -07:00
Bing Xu
211d85f04b make py work 2014-08-27 20:55:44 -06:00
tqchen@graphlab.com
4369bc2bfd chg code guide 2014-08-27 19:31:49 -07:00
tqchen@graphlab.com
b162acb858 adapt R package 2014-08-27 19:30:09 -07:00
Tianqi Chen
f9541efa01 Merge pull request #50 from tqchen/master
pull master into unity
2014-08-27 19:19:48 -07:00
tqchen@graphlab.com
075dc9a998 pass build 2014-08-27 19:19:04 -07:00
tqchen@graphlab.com
8aeb038ddd seems ok, need review destructors 2014-08-27 19:12:13 -07:00
tqchen@graphlab.com
f175e1cfb4 finish refactor, need debug 2014-08-27 18:33:52 -07:00
tqchen@graphlab.com
605269133e complete refactor data.h, now replies on iterator to access column 2014-08-27 17:00:21 -07:00
unknown
ae4128fcb2 styling of else in R 2014-08-27 16:46:47 -07:00
Tong He
114cfb2167 fix a tiny bug in xgboost 2014-08-27 15:51:34 -07:00
unknown
b151617ac1 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-08-27 15:49:26 -07:00
unknown
02df006286 modify readme in R-package 2014-08-27 15:15:22 -07:00
unknown
d693e8d5cc use demo instead of inst 2014-08-27 15:10:07 -07:00
unknown
0f0c12707c modify xgb.getinfo to getinfo 2014-08-27 15:03:24 -07:00
Tianqi Chen
0b5e611c22 Merge pull request #49 from giuliohome/master
Thanks giulio!
2014-08-27 14:49:06 -07:00
giuliohome
f3136c2d92 README 2014-08-27 23:24:57 +02:00
giuliohome
73c42d4574 FIX: If you are using Windows, __declspec(dllexport) is necessary 2014-08-27 23:21:55 +02:00
unknown
a060a2e9a6 remove old R demo files 2014-08-27 13:16:16 -07:00
unknown
247e0d5d78 tidy code by formatR 2014-08-27 13:15:28 -07:00
unknown
4dcc7d7303 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-08-27 12:58:04 -07:00
unknown
d747172d37 refinement of R package 2014-08-27 12:57:37 -07:00
Tianqi Chen
57c0ab2721 Update xgboost.py 2014-08-27 12:27:25 -07:00
Tianqi Chen
2451ba0f1c Merge pull request #48 from giuliohome/master
adding a dll project to the msvc solution for the python wrapper on win64
2014-08-27 12:24:09 -07:00
giuliohome
30b31a6910 win64 python dll project 2014-08-27 20:38:30 +02:00
giuliohome
1383afd8f4 MSVS DLL Project for Python wrapper (ver.3 on win64) 2014-08-27 20:27:05 +02:00
giuliohome
ce1803a40c Merge pull request #1 from tqchen/master
updating fork to current master
2014-08-27 20:17:44 +02:00
tqchen@graphlab.com
a59f8945dc rename SparseBatch to RowBatch 2014-08-27 10:56:55 -07:00
tqchen@graphlab.com
d5a5e0a42a rename findex->index 2014-08-27 10:52:27 -07:00
tqchen@graphlab.com
f3a3470916 make wrapper compile 2014-08-27 10:48:25 -07:00
tqchen@graphlab.com
0fe5470a4f delete extra things 2014-08-27 09:59:39 -07:00
unknown
0130be4acc major change in the design of R interface 2014-08-26 23:41:03 -07:00
Tianqi Chen
84e5fc285b bst_ulong supported by sparsematrix builder 2014-08-26 20:32:33 -07:00
tqchen
414e7f27ff Merge branch 'master' into unity
Conflicts:
	src/learner/evaluation-inl.hpp
	wrapper/xgboost_R.cpp
	wrapper/xgboost_wrapper.cpp
	wrapper/xgboost_wrapper.h
2014-08-26 20:32:07 -07:00
tqchen
4787108b5f change uint64_t to ulong, to make mac happy, this is final change 2014-08-26 20:10:07 -07:00
Tianqi Chen
d00f27dc6b change uint64_t to depend on utils 2014-08-26 20:08:13 -07:00
Tianqi Chen
3e5cb25830 minor fix, add openmp 2014-08-26 20:02:10 -07:00
Tianqi Chen
9d2c1cf9f5 add omp uint when openmp is not there 2014-08-26 19:59:55 -07:00
tqchen
90226035fa chg r package path back 2014-08-26 19:39:34 -07:00
tqchen
7739f57c8b change omp loop var to bst_omp_uint, add XGB_DLL to wrapper 2014-08-26 19:37:04 -07:00
tqchen
97467fe807 chg size_t to uint64_t 2014-08-26 19:12:51 -07:00
tqchen
2623ab0a60 chg size_t to uint64_t unsigned long in wrapper 2014-08-26 19:06:53 -07:00
tqchen
3c1ed847fb remove dependency on bst 2014-08-26 18:06:22 -07:00
Tianqi Chen
636ffaf23b Merge pull request #46 from tqchen/master
merge master into unity
2014-08-26 12:18:26 -07:00
tqchen@graphlab.com
46f14b8c27 fix magic so that it can detect binary file 2014-08-26 12:17:27 -07:00
tqchen@graphlab.com
9eb32b9dd4 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-08-26 10:24:04 -07:00
tqchen@graphlab.com
2e3c214173 improve makefile 2014-08-26 10:23:57 -07:00
hetong
41d290906f fix NAMESPACE with export method predict 2014-08-26 10:14:29 -07:00
hetong
262108cf3b modify demo filenames 2014-08-26 10:02:13 -07:00
hetong
d9f363632a Merge branch 'master' of https://github.com/tqchen/xgboost
Initial development of R pacakge and merge with the modification from tqchen.
2014-08-26 09:57:38 -07:00
hetong
4940fff55b export fewer functions to user and optimize parameter setting 2014-08-26 09:57:28 -07:00
Tianqi Chen
98e92f1a79 more detailed warning 2014-08-26 09:29:17 -07:00
Tianqi Chen
b1bffde6c9 fix compile under rtools 2014-08-26 09:09:28 -07:00
hetong
5f6d5d19b8 import package methods in desc 2014-08-25 23:01:53 -07:00
tqchen@graphlab.com
a1f1015ae1 add package parameter to all calls, test pass in mac 2014-08-25 22:25:03 -07:00
tqchen
7297c0a92b add openmp flags 2014-08-25 22:14:48 -07:00
tqchen
ddc0970c46 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-08-25 22:02:19 -07:00
tqchen
0fca16008e runnable 2014-08-25 22:01:35 -07:00
Tianqi Chen
47a0e84c5f add win make 2014-08-25 21:54:24 -07:00
tqchen
c6eaf01a97 add git ignore 2014-08-25 21:25:49 -07:00
tqchen
68f38cf228 initial trial package 2014-08-25 21:20:55 -07:00
Tianqi Chen
c6d59dac4b Merge pull request #45 from tqchen/master
better error handling
2014-08-25 16:00:33 -07:00
tqchen@graphlab.com
c2484f3134 better error handling 2014-08-25 15:58:52 -07:00
tqchen
4c04cf8728 add grow5 back, seems no changes 2014-08-25 14:08:38 -07:00
tqchen
0066cd13a7 Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity 2014-08-25 13:57:21 -07:00
tqchen
3e9f8bfac9 change things back 2014-08-25 13:56:03 -07:00
tqchen@graphlab.com
6da62159d0 fix by giulio 2014-08-25 12:10:45 -07:00
tqchen@graphlab.com
e26af5e66c Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity 2014-08-25 12:08:50 -07:00
tqchen@graphlab.com
b83a96fa21 fix by giulio 2014-08-25 12:08:41 -07:00
tqchen
b708f3f029 Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity
Conflicts:
	src/learner/evaluation-inl.hpp
2014-08-25 11:56:59 -07:00
tqchen@graphlab.com
d61b0b757f chg 2014-08-25 11:35:38 -07:00
tqchen@graphlab.com
c78a2164c2 fix line from auto spacing by msvc 2014-08-25 11:34:49 -07:00
tqchen
9e5788a47c Merge branch 'master' into unity 2014-08-25 11:22:37 -07:00
tqchen
e4b9ee22fa :Merge branch 'unity'
Conflicts:
	src/gbm/gbtree-inl.hpp
	src/learner/evaluation-inl.hpp
	src/tree/param.h
2014-08-25 11:21:56 -07:00
Tianqi Chen
bd52a7f448 changes 2014-08-25 11:13:06 -07:00
Tianqi Chen
ca0b008fb0 clean up warnings from msvc 2014-08-25 11:01:21 -07:00
tqchen
fd03239b77 fix now today, try to think how to work tmr 2014-08-24 22:08:21 -07:00
tqchen
f62b4a02f9 beta version, do a review 2014-08-24 21:36:30 -07:00
tqchen
ce97f2fdf8 a fixed version 2014-08-24 21:17:13 -07:00
tqchen
6daa1c365d add cvgrad stats, simplify data 2014-08-24 20:07:16 -07:00
tqchen
c640485f1d initial correction for vec tree 2014-08-24 18:48:19 -07:00
Tianqi Chen
4f0b0d2c88 Merge pull request #43 from tqchen/unity
add changes that are not commited
2014-08-24 17:26:21 -07:00
tqchen
7874c2559b add changes 2014-08-24 17:25:17 -07:00
Tianqi Chen
4c023077dd Merge pull request #42 from tqchen/unity
Unity this is final minor change in data structure
2014-08-24 17:23:46 -07:00
tqchen
da75f8f1a4 move ncol, row to booster, add set/get uint info 2014-08-24 17:19:22 -07:00
tqchen
19447cdb12 chg higgs back 2014-08-24 16:09:13 -07:00
tqchen
4889b40abc tstats now depend on param 2014-08-24 16:08:58 -07:00
tqchen
49e6575c86 add set leaf, constructor of tstats now rely on param 2014-08-24 16:07:59 -07:00
Tianqi Chen
d7c6f8e81a Merge pull request #41 from tqchen/unity
Unity
2014-08-24 15:24:20 -07:00
tqchen
ba9fbd380c templatize refresher 2014-08-24 15:22:11 -07:00
tqchen
f71b732e7a refactor grad stats to be like visitor 2014-08-24 15:17:22 -07:00
Tianqi Chen
c0496685c4 Merge pull request #39 from tqchen/unity
fix mac compile issue
2014-08-24 09:52:03 -07:00
tqchen
d49c6e6e84 fix 2014-08-24 09:51:15 -07:00
tqchen
88beee5639 try to fix compile bug 2014-08-24 09:47:08 -07:00
tqchen@graphlab.com
46d41a2b43 fix compilation on mac 2014-08-24 09:32:06 -07:00
Tianqi Chen
40483e6dc3 Merge pull request #38 from tqchen/unity
Unity
2014-08-23 21:16:14 -07:00
tqchen
b381c842f1 link glc 2014-08-23 21:14:53 -07:00
tqchen
5802141d59 add glc comment 2014-08-23 21:12:55 -07:00
Tianqi Chen
cf274e76f4 Merge pull request #37 from tqchen/unity
Unity
2014-08-23 20:54:27 -07:00
tqchen
fea7245fa0 chg python back 2014-08-23 20:53:56 -07:00
tqchen
d16a56814b remove pred.csv 2014-08-23 20:53:16 -07:00
tqchen
ed9d8a1c0e add higgs example 2014-08-23 20:52:56 -07:00
Tianqi Chen
851f3fce86 Merge pull request #36 from tqchen/unity
add acknowledgement
2014-08-23 19:05:22 -07:00
tqchen
d86cd62415 add acknowledgement 2014-08-23 19:04:50 -07:00
Tianqi Chen
cd16a3b124 Merge pull request #35 from tqchen/unity
ok
2014-08-23 18:59:52 -07:00
tqchen
a656e61571 ok 2014-08-23 18:57:19 -07:00
Tianqi Chen
b2b5895634 Merge pull request #34 from tqchen/unity
Unity
2014-08-23 18:56:38 -07:00
tqchen
3b12ff51b9 seems ok 2014-08-23 18:38:39 -07:00
tqchen
de83ac72ea complete R example 2014-08-23 15:26:08 -07:00
tqchen
8bf758c63b chg wrapper 2014-08-23 14:27:56 -07:00
tqchen
08a6b92216 chg 2014-08-23 14:20:29 -07:00
tqchen
3ba7995754 finish dump 2014-08-23 13:09:47 -07:00
tqchen
40da2fa2c0 workable R wrapper 2014-08-23 12:14:44 -07:00
tqchen
5e23f6577f try add R wrapper 2014-08-23 09:30:02 -07:00
tqchen
9d210f9bd3 ok 2014-08-22 20:14:43 -07:00
Tianqi Chen
741bfe015f Merge pull request #32 from tqchen/master
merge master into unity
2014-08-22 20:13:23 -07:00
Tianqi Chen
13b5269855 Update machine.conf 2014-08-22 20:00:04 -07:00
Tianqi Chen
cf69d34d06 Update mq2008.conf 2014-08-22 19:59:30 -07:00
Tianqi Chen
4378f1f039 Update mushroom.conf 2014-08-22 19:58:59 -07:00
Tianqi Chen
3acd10e031 Merge pull request #31 from tqchen/unity
Change master branch into unity
2014-08-22 19:54:48 -07:00
tqchen
58cda4d708 ok 2014-08-22 19:53:52 -07:00
tqchen
104fced9c3 ok 2014-08-22 19:52:43 -07:00
tqchen
ce5b776bdc add change note 2014-08-22 19:47:05 -07:00
tqchen
07ddf98718 add log 2014-08-22 19:41:58 -07:00
tqchen
2ac8cdb873 check in linear model 2014-08-22 19:27:33 -07:00
tqchen
37b707e110 clean up 2014-08-22 16:51:27 -07:00
tqchen
bf71cf52be add 2014-08-22 16:50:28 -07:00
tqchen
24030b26fd add 2014-08-22 16:49:42 -07:00
tqchen
edc539a024 add message about glc 2014-08-22 16:47:50 -07:00
tqchen
4ed67b9c27 Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity 2014-08-22 16:26:45 -07:00
tqchen
58354643b0 chg root index to booster info, need review 2014-08-22 16:26:37 -07:00
tqchen
a45fb2d737 Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity 2014-08-22 16:10:23 -07:00
tqchen
3f5b5e1fdc add apratio 2014-08-22 16:10:19 -07:00
tqchen
58d74861b9 fix multiclass 2014-08-22 14:29:32 -07:00
tqchen@graphlab.com
1fd6ff817f ok 2014-08-19 12:20:31 -07:00
tqchen@graphlab.com
9caccd3b36 change row subsample to prob 2014-08-19 12:07:52 -07:00
tqchen@graphlab.com
91e70c76ff refresher test 2014-08-19 11:41:35 -07:00
tqchen
762b360739 fix typo 2014-08-19 08:42:36 -07:00
tqchen
e7de77aa1f chg 2014-08-19 08:08:54 -07:00
tqchen
406db647f2 add pratio 2014-08-19 08:05:05 -07:00
tqchen
fdba6e9c46 add pratio 2014-08-19 08:02:29 -07:00
tqchen
d08d8ed3ed add tree refresher, need review 2014-08-18 21:32:48 -07:00
tqchen
f757520c02 add tree refresher, need review 2014-08-18 21:32:31 -07:00
tqchen
dbf3a21942 change dense fvec logic to tree 2014-08-18 19:03:32 -07:00
tqchen
1d8c2391e8 update tree maker to make it more robust 2014-08-18 14:58:30 -07:00
tqchen
3de07b0abe add more guideline about python path 2014-08-18 14:12:35 -07:00
tqchen@graphlab.com
3b02fb26b0 fix num parallel tree 2014-08-18 13:33:58 -07:00
tqchen@graphlab.com
c4b21775fa some lint 2014-08-18 12:57:31 -07:00
antinucleon
e9bfc026b7 fix typo 2014-08-18 13:38:09 -06:00
antinucleon
0b36c8295d lack include 2014-08-18 13:33:36 -06:00
tqchen@graphlab.com
9da2ced8a2 add base_margin 2014-08-18 12:20:13 -07:00
tqchen@graphlab.com
46fed899ab add more note 2014-08-18 10:57:08 -07:00
tqchen@graphlab.com
f6c763a2a7 fix base score, and print message 2014-08-18 10:53:15 -07:00
tqchen@graphlab.com
04e04ec5a0 chg readme 2014-08-18 10:19:47 -07:00
tqchen@graphlab.com
66ae3a7578 add no omp flag 2014-08-18 10:17:49 -07:00
tqchen@graphlab.com
7c068cbe46 fix mac 2014-08-18 10:14:34 -07:00
tqchen
d3bfc31e6a enforce putting iteration numbers in train 2014-08-18 09:00:23 -07:00
tqchen
3c1c7e2780 Merge branch 'unity' of ssh://github.com/tqchen/xgboost into unity 2014-08-18 08:57:45 -07:00
tqchen
e912dd3364 fix omp 2014-08-18 08:57:26 -07:00
Bing Xu
b76853731c make it compatible with old code 2014-08-18 02:10:54 -04:00
tqchen
0d9a8c042c make xgcombine buffer work 2014-08-17 22:49:36 -07:00
tqchen
4ed4b08146 ok 2014-08-17 20:47:20 -07:00
tqchen
5a472145de check in rank loss 2014-08-17 20:32:02 -07:00
tqchen
9df8bb1397 check in softmax multiclass 2014-08-17 19:16:17 -07:00
tqchen
e77df13815 ok 2014-08-17 18:49:54 -07:00
tqchen
301685e0a4 python module pass basic test 2014-08-17 18:43:25 -07:00
tqchen
af100dd869 remake the wrapper 2014-08-17 17:43:46 -07:00
tqchen
2c969ecf14 first version that reproduce binary classification demo 2014-08-16 15:44:35 -07:00
tqchen
c4acb4fe01 check in io module 2014-08-16 14:06:31 -07:00
tqchen
ac1cc15b90 pass fmatrix as const 2014-08-15 21:24:23 -07:00
tqchen
d9dbd1efc6 modify readme 2014-08-15 21:06:44 -07:00
tqchen
34dd409c5b mv code into src 2014-08-15 21:04:23 -07:00
tqchen
3589e8252f refactor config 2014-08-15 21:02:33 -07:00
tqchen
dafa44753a chg readme 2014-08-15 20:22:54 -07:00
tqchen
2a92c82b92 start unity refactor 2014-08-15 20:15:58 -07:00
tqchen@graphlab.com
5b215742c2 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-08-15 13:36:56 -07:00
tqchen@graphlab.com
5edc4f3775 save name_obj from now 2014-08-15 13:36:19 -07:00
Tianqi Chen
6d7b33a883 Update README.md 2014-08-12 14:57:28 -07:00
Tianqi Chen
f033f88221 Update README.md 2014-08-12 14:57:05 -07:00
Tianqi Chen
048194ce23 Update README.md 2014-08-12 14:56:51 -07:00
Tianqi Chen
e7ae704504 Update README.md 2014-08-12 14:56:12 -07:00
tqchen
662733db31 support for multiclass output prob 2014-08-01 11:21:17 -07:00
Tianqi Chen
8b4f7d7fa2 Update xgboost_regrank.h 2014-07-12 10:14:30 -07:00
Tianqi Chen
497fc86998 Merge pull request #16 from smly/minor-leak
fix (trivial) leak in xgboost_regrank, Thanks for the fix
2014-07-12 09:58:07 -07:00
Kohei Ozaki
0516d09938 fix (trivial) leak in xgboost_regrank 2014-07-12 17:29:49 +09:00
tqchen
1620cfc9e8 fix combine buffer 2014-05-25 16:46:03 -07:00
tqchen
ec62953e54 add rand seeds back 2014-05-25 10:18:04 -07:00
tqchen
86515a2c15 ok 2014-05-25 10:15:57 -07:00
Tianqi Chen
1048561ede change rank order output to follow kaggle convention 2014-05-25 10:08:38 -07:00
tqchen
6abfce620c make python random seed invariant in each round 2014-05-24 20:57:39 -07:00
tqchen
e2999a0efb fix sometimes python cachelist problem 2014-05-20 15:42:19 -07:00
tqchen
89a2fc5e94 more clean demo 2014-05-20 08:33:35 -07:00
tqchen
ea3bf5d57e fix bug in classification, scale_pos_weight initialization 2014-05-20 08:30:19 -07:00
tqchen
f4dedc4d2d chg 2014-05-19 10:02:01 -07:00
Tianqi Chen
1b9372f431 Merge pull request #7 from jrings/master
Compatibility with both Python 2(.7) and 3
2014-05-19 09:48:34 -07:00
Joerg Rings
93d83ca077 Compatibility with both Python 2(.7) and 3 2014-05-19 11:23:53 -05:00
Tianqi Chen
991634a58e Merge pull request #6 from tqchen/dev
Fix the bug in MAC
2014-05-17 11:07:42 -07:00
tqchen
7aae2ec009 add omp flag back 2014-05-17 11:07:12 -07:00
tqchen
1afe894a63 use back g++ 2014-05-17 11:06:36 -07:00
tqchen
29363d6100 force handle as void_p, seems fix mac problem 2014-05-17 11:03:21 -07:00
Tianqi Chen
049e8cfb2d Merge pull request #5 from tqchen/dev
add return type for xgboost, don't know if it is mac problem. #4
2014-05-17 09:19:20 -07:00
tqchen
2507e4403a add return type for xgboost, don't know if it is mac problem 2014-05-17 09:13:54 -07:00
Tianqi Chen
007f60a352 Update README.md 2014-05-16 22:54:24 -07:00
Tianqi Chen
85108e6a65 Merge pull request #2 from tqchen/dev
fix loss_type
2014-05-16 21:30:09 -07:00
tqchen
3975bf1e62 some cleanup 2014-05-16 21:29:14 -07:00
tqchen
baed0d0f08 fix for loss_type problem in outside reset base 2014-05-16 21:28:03 -07:00
tqchen
bf473bd6c8 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-05-16 20:58:03 -07:00
tqchen
71fc734d3b chg 2014-05-16 20:57:54 -07:00
antinucleon
9f3e5a2778 del 2014-05-17 03:57:38 +00:00
Tianqi Chen
59a9b6b325 Merge pull request #1 from tqchen/dev
2.0 version, lots of changes
2014-05-16 20:53:19 -07:00
Tianqi Chen
8e941b2a79 Update README.md 2014-05-16 20:49:05 -07:00
tqchen
877bac216c Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 20:46:18 -07:00
tqchen
348d35a668 add ignore 2014-05-16 20:46:08 -07:00
tqchen
d7bb10eb79 final check 2014-05-16 20:44:02 -07:00
Tianqi Chen
4dadc76652 Update README.md 2014-05-16 20:41:59 -07:00
Tianqi Chen
4218c1ef53 Update README.md 2014-05-16 20:41:43 -07:00
Tianqi Chen
32a3371073 Update README.md 2014-05-16 20:41:21 -07:00
Tianqi Chen
58cbfa0692 Update README.md 2014-05-16 20:41:05 -07:00
tqchen
51482a29bf Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 20:37:55 -07:00
tqchen
d429289ad3 ok 2014-05-16 20:37:45 -07:00
yepyao
1cf41066d9 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev 2014-05-17 11:36:12 +08:00
yepyao
391be10806 small change 2014-05-17 11:35:43 +08:00
yepyao
255bad90cb small change 2014-05-17 11:34:24 +08:00
tqchen
84afaaaa7d Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 20:29:17 -07:00
tqchen
b07ff1ac8d fix softmax 2014-05-16 20:28:07 -07:00
antinucleon
3e4dd2fce0 chg 2014-05-16 21:27:37 -06:00
tqchen
6c72d02205 chg 2014-05-16 20:18:34 -07:00
Tianqi Chen
cfd6c9e3b7 Update train.py 2014-05-16 20:16:10 -07:00
tqchen
8e5e3340a2 multi class 2014-05-16 20:12:04 -07:00
antinucleon
f52f7b7899 demo 2014-05-16 21:05:11 -06:00
antinucleon
f971d1b554 Merge branch 'dev' of github.com:tqchen/xgboost into dev 2014-05-16 21:03:32 -06:00
Tianqi Chen
7537d691d9 Update README.md 2014-05-16 20:00:20 -07:00
antinucleon
c67b098bd6 demo 2014-05-17 02:59:10 +00:00
antinucleon
d05cb13751 demo 2014-05-16 20:57:42 -06:00
tqchen
2cae28087a do not need to dump in rank 2014-05-16 19:52:39 -07:00
tqchen
12bf54d4ef Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 19:51:41 -07:00
tqchen
6a9438ac86 before commit 2014-05-16 19:51:33 -07:00
yepyao
c4a783f408 small change 2014-05-17 10:50:15 +08:00
yepyao
e872f488a5 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev
Conflicts:
	demo/rank/mq2008.conf
	demo/rank/runexp.sh
	regrank/xgboost_regrank_obj.h
2014-05-17 10:40:12 +08:00
yepyao
e565916c1c fix small bug 2014-05-17 10:35:10 +08:00
tqchen
a70454e3ce add bing to author list 2014-05-16 19:33:59 -07:00
Tianqi Chen
1150fb59a8 Update demo.py 2014-05-16 19:30:32 -07:00
tqchen
53633ae9c2 chgs 2014-05-16 19:24:53 -07:00
tqchen
98e507451c chg all settings to obj 2014-05-16 19:10:52 -07:00
tqchen
213375baca pre-release version 2014-05-16 18:49:02 -07:00
tqchen
8a0f8a93c7 chg scripts 2014-05-16 18:46:43 -07:00
tqchen
02cefb8f1b cleanup 2014-05-16 18:40:46 -07:00
tqchen
bee87cfce7 chg rank demo 2014-05-16 18:38:40 -07:00
tqchen
4743cc98ec Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 18:29:37 -07:00
tqchen
bf66d31b49 chng few things 2014-05-16 18:25:01 -07:00
tqchen
c67b4d1864 minor changes 2014-05-16 18:19:57 -07:00
antinucleon
4bf23cfbb1 new speed test 2014-05-16 18:05:17 -06:00
antinucleon
4bcf947408 speedtest 2014-05-16 17:48:03 -06:00
yepyao
4d03729683 use ndcg@all in lambdarank for ndcg 2014-05-16 23:06:24 +08:00
yepyao
5db373e73c small change 2014-05-16 21:20:41 +08:00
yepyao
e3a0c0efe5 Download data set from web site 2014-05-16 21:18:32 +08:00
kalenhaha
07e98254f5 Impement new Lambda rank interface 2014-05-16 20:42:46 +08:00
tqchen
2baeeabac4 new lambda rank interface 2014-05-16 00:02:26 -07:00
Bing Xu
da0bb3f44e Update README.md 2014-05-16 01:30:29 -04:00
tqchen
92d1df2d2e ok 2014-05-15 21:17:17 -07:00
tqchen
6af6d64f0b a correct version 2014-05-15 21:11:46 -07:00
tqchen
2be3f6ece0 fix numpy convert 2014-05-15 20:28:34 -07:00
tqchen
a7f3d7edd7 ok 2014-05-15 20:05:22 -07:00
tqchen
c22df2b31a ok 2014-05-15 18:56:28 -07:00
tqchen
e2d13db24e bug fix in pairwise rank 2014-05-15 15:37:58 -07:00
tqchen
37e1473cea cleanup code 2014-05-15 15:01:41 -07:00
tqchen
3960ac9cb4 add xgcombine_buffer with weights 2014-05-15 14:41:11 -07:00
tqchen
a59969cd52 change data format to include weight in binary file, add get weight to python 2014-05-15 14:37:56 -07:00
tqchen
3cb42d3f87 ok 2014-05-15 14:25:44 -07:00
tqchen
88526668f5 add ams 2014-05-14 23:23:27 -07:00
tqchen
31a0823e6d some fix 2014-05-14 16:55:59 -07:00
tqchen
ae9d937510 add AMS metric 2014-05-14 11:30:45 -07:00
kalenhaha
121348c0d7 add in grad and hess rescale in lambdarank 2014-05-14 23:13:27 +08:00
kalenhaha
671c34be63 small bug in ndcg eval 2014-05-13 14:30:42 +08:00
kalenhaha
8967be4af5 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev 2014-05-12 22:22:32 +08:00
kalenhaha
5411e2a500 Add LETOR MQ2008 for rank demo 2014-05-12 22:21:07 +08:00
kalenhaha
e858523d19 remove sampler 2014-05-11 14:31:57 +08:00
kalenhaha
6648a15817 small change 2014-05-11 14:25:30 +08:00
kalenhaha
faf35c409e small change 2014-05-11 14:03:21 +08:00
tqchen
604568b512 simple chgs 2014-05-09 20:39:15 -07:00
kalenhaha
f7b2281510 fix some warnings 2014-05-09 14:14:43 +08:00
kalenhaha
0794dd0f6f Merge branch 'dev' of https://github.com/tqchen/xgboost into dev 2014-05-09 14:07:06 +08:00
kalenhaha
4b6024c563 Separating Lambda MAP and Lambda NDCG 2014-05-09 14:05:52 +08:00
tqchen
41edad7b3d add python o3 2014-05-08 20:15:23 -07:00
tqchen
2ccd28339e faster convert to numpy array 2014-05-08 19:35:06 -07:00
tqchen
a0c0fbbb61 commit the fix 2014-05-08 19:31:32 -07:00
tqchen
06327ff8d0 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-07 12:00:17 -07:00
tqchen
0bf6261961 fix omp for bug in obj 2014-05-07 11:52:12 -07:00
kalenhaha
8b3fc78999 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev
Conflicts:
	regrank/xgboost_regrank_obj.hpp
2014-05-07 22:15:59 +08:00
tqchen
833cf29867 fix 2014-05-06 16:53:37 -07:00
tqchen
4b00b3e565 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-06 16:51:18 -07:00
tqchen
abe5309977 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev
Conflicts:
	regrank/xgboost_regrank_data.h
2014-05-06 16:51:11 -07:00
tqchen
7ddff7b570 add regrank utils 2014-05-06 16:50:46 -07:00
tqchen
c39e1f2f30 right group size 2014-05-06 16:49:10 -07:00
tqchen
4f9833ed76 add cutomized training 2014-05-04 13:57:10 -07:00
tqchen
9bc699fd0e add cutomized training 2014-05-04 13:55:58 -07:00
tqchen
8c0c10463e add boost group support to xgboost. now have beta multi-class classification 2014-05-04 12:10:03 -07:00
kalenhaha
8eae8d956d c++11 features removed 2014-05-04 16:58:44 +08:00
kalenhaha
7161618b4c c++11 features removed 2014-05-04 16:56:57 +08:00
tqchen
21f93ffd6a fix 2014-05-04 00:09:16 -07:00
tqchen
2057dda560 add interact mode 2014-05-03 23:24:22 -07:00
tqchen
6fd77cbb24 add python interface for xgboost 2014-05-03 23:04:02 -07:00
tqchen
adc9400736 finish python lib 2014-05-03 22:18:25 -07:00
tqchen
20de7f8f97 finish matrix 2014-05-03 17:12:25 -07:00
tqchen
5bab27cfa6 good 2014-05-03 16:15:44 -07:00
tqchen
30e725a28c ok 2014-05-03 14:24:00 -07:00
tqchen
aab1b0e7b3 important change to regrank interface, need some more test 2014-05-03 14:20:27 -07:00
tqchen
2305ea7af7 try python 2014-05-03 10:54:08 -07:00
tqchen
c1223bfdef pass test 2014-05-02 18:04:45 -07:00
tqchen
cc91c73160 add new combine tool as promised 2014-05-02 12:55:34 -07:00
tqchen
cbceeb8ca6 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-01 11:01:05 -07:00
tqchen
ef7df40bc8 cleanup of evaluation metric, move c++11 codes into sample.h for backup, add lambda in a clean way latter 2014-05-01 11:00:50 -07:00
Tianqi Chen
f93ccda075 Update xgboost_omp.h 2014-05-01 10:16:05 -07:00
kalenhaha
f17d400fd3 fix some bugs in linux 2014-05-02 00:16:12 +08:00
kalenhaha
b836b1123e lambda rank added 2014-05-01 22:17:26 +08:00
tqchen
bf64608cc9 add softmax 2014-04-30 22:11:26 -07:00
tqchen
54c482ffd5 add pre @ n 2014-04-30 22:00:53 -07:00
tqchen
223bb5638b use omp parallel sortting 2014-04-30 09:48:41 -07:00
tqchen
bb93c0aaac add rank 2014-04-30 09:32:42 -07:00
tqchen
a383f11759 add pairwise rank first version 2014-04-29 21:12:30 -07:00
tqchen
81414c0e5b new AUC code 2014-04-29 17:26:58 -07:00
tqchen
87a9c22795 new AUC evaluator, now compatible with weighted loss 2014-04-29 17:03:34 -07:00
tqchen
31edfda03c make regression module compatible with rank loss, now support weighted loss 2014-04-29 16:16:02 -07:00
tqchen
7a79c009ce chg fmap format 2014-04-29 09:59:10 -07:00
tqchen
ea354683b4 add auc evaluation metric 2014-04-24 22:20:40 -07:00
tqchen
7f9637aae4 remove unwanted private field 2014-04-21 10:42:19 -07:00
tqchen
5f0018b070 expose fmatrixs 2014-04-18 18:18:19 -07:00
tqchen
c3592dc06c Merge branch 'master' of ssh://github.com/tqchen/xgboost
Conflicts:
	regression/xgboost_reg_data.h
2014-04-18 17:46:44 -07:00
tqchen
3d327503fd simplify data 2014-04-18 17:43:44 -07:00
kalenhaha
91bb4777b0 Lambda rank added 2014-04-11 10:50:13 +08:00
kalenhaha
efeea99283 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-04-11 10:48:45 +08:00
kalenhaha
07eea71010 Lambda rank added 2014-04-10 22:11:15 +08:00
kalenhaha
c8b2f46b89 lambda rank added 2014-04-10 22:09:19 +08:00
Tianqi Chen
a022a783ce Update xgboost_utils.h 2014-04-07 16:25:21 -07:00
kalenhaha
a10f594644 rank pass toy 2014-04-07 23:25:35 +08:00
tqchen
40c380e40a add deleted main back 2014-04-06 09:32:27 -07:00
kalenhaha
1fa367b220 small fix 2014-04-06 22:54:41 +08:00
kalenhaha
6bc71df494 compiled 2014-04-06 22:51:52 +08:00
tqchen
ddb8a6982c add dev 2014-04-04 10:42:13 -07:00
kalenhaha
c62dea8325 pairwise ranking implemented 2014-04-05 00:14:55 +08:00
kalenhaha
0b1e584d73 Adding ranking task 2014-04-03 16:22:55 +08:00
tqchen
dc239376c7 add dump nice to regression demo 2014-03-26 16:47:01 -07:00
tqchen
7d97d6b1d4 update regression 2014-03-26 16:25:44 -07:00
kalenhaha
0a971cb466 small fix 2014-03-27 00:08:47 +08:00
kalenhaha
52992442ad Merge branch 'master' of https://github.com/tqchen/xgboost 2014-03-26 23:50:56 +08:00
tqchen
c751d6ead3 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-03-25 17:18:27 -07:00
tqchen
c7869a7855 small fix 2014-03-25 17:17:00 -07:00
Tianqi Chen
87fc848b12 Update README.md 2014-03-26 08:01:47 +08:00
Tianqi Chen
159ed0f7e1 Update README.md 2014-03-26 08:01:24 +08:00
Tianqi Chen
f7d9c774d7 Update README 2014-03-26 07:21:15 +08:00
kalenhaha
feb914c35b change the regression demo data set 2014-03-24 23:23:11 +08:00
tqchen
d93e8717c1 fix test to pred 2014-03-24 00:31:53 -07:00
kalenhaha
57713be940 remove test directory 2014-03-23 00:05:46 +08:00
kalenhaha
77901f2428 adding regression demo 2014-03-22 21:52:29 +08:00
kalenhaha
55d1b1e109 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-03-22 21:50:31 +08:00
kalenhaha
193d1d165f separate binary classification and regression demo 2014-03-22 21:48:27 +08:00
Tianqi Chen
bc071cac4f Update README.md 2014-03-20 23:12:41 -07:00
Tianqi Chen
50c76ec0d3 Update README.md 2014-03-20 23:12:16 -07:00
tqchen
db285cc4ba add batch running 2014-03-20 16:27:24 -07:00
tqchen
255b1f4043 add feature constraint 2014-03-19 10:47:56 -07:00
tqchen
d3fe4b26a9 fixed remove bug 2014-03-13 13:42:40 -07:00
tqchen
c13126191d neglok 2014-03-12 20:28:21 -07:00
tqchen
8c8dd1a740 support int type 2014-03-12 17:58:14 -07:00
tqchen
329cc61795 more compact 2014-03-11 13:07:20 -07:00
tqchen
a191863213 add accuracy 2014-03-11 13:06:22 -07:00
tqchen
d9ff9fadf6 fix delete 2014-03-11 12:40:51 -07:00
tqchen
377a573097 add remove tree 2014-03-11 11:25:50 -07:00
tqchen
364b4a0f77 add name dumpath 2014-03-06 11:23:51 -08:00
tqchen
d960550933 add add and remove 2014-03-05 16:39:07 -08:00
tqchen
ef5a389ecf try interact mode 2014-03-05 15:28:53 -08:00
tqchen
2bdcad9630 add a test folder 2014-03-05 15:20:11 -08:00
tqchen
74828295fe complete row maker 2014-03-05 14:38:13 -08:00
tqchen
73dfdc539b add row tree maker, to be finished 2014-03-05 11:00:03 -08:00
tqchen
cf14b11130 split new base treemaker, not very good abstraction, but ok 2014-03-05 10:20:36 -08:00
tqchen
8ef7d6beb4 fix reg model_out 2014-03-05 09:34:37 -08:00
tqchen
0fdda29470 reupdate data 2014-03-04 22:47:39 -08:00
tqchen
1479adba58 fix text 2014-03-04 16:22:24 -08:00
tqchen
ae5c26daf6 fix fmatrix 2014-03-04 11:45:22 -08:00
tqchen
ffcfb12515 add simple text loader 2014-03-04 11:33:33 -08:00
tqchen
cba130c40c ok fix 2014-03-03 22:20:45 -08:00
tqchen
9da9861377 big change, change interface to template, everything still OK 2014-03-03 22:16:37 -08:00
tqchen
fad6522a53 backup makefile 2014-03-03 15:21:50 -08:00
tqchen
bbbbe6bc4e compatibility issue with openmp 2014-03-03 15:11:41 -08:00
tqchen
5a65f4b958 ok 2014-03-03 12:26:40 -08:00
tqchen
f0b38810bb maptree is not needed 2014-03-03 11:06:24 -08:00
tqchen
623e003923 fix fmap 2014-03-03 11:05:10 -08:00
tqchen
074a861e7b auto do reboost 2014-03-02 16:42:22 -08:00
tqchen
d534c22094 chg file name of reg 2014-03-02 16:39:00 -08:00
tqchen
4ebdd3cdd2 chg file name of reg 2014-03-02 16:38:59 -08:00
tqchen
c2460da2ab change test task to pred 2014-03-02 16:20:42 -08:00
tqchen
2dd03b1963 make style more like Google style 2014-03-02 13:30:24 -08:00
tqchen
7761d562b1 add smart decision of nfeatures 2014-03-01 21:49:29 -08:00
tqchen
0f410ac54a fix type 2014-03-01 21:29:07 -08:00
tqchen
75427938c3 add smart load 2014-03-01 21:15:54 -08:00
tqchen
5cdc38648b full omp support for regression 2014-03-01 20:56:25 -08:00
tqchen
550010e9d2 fix col maker, make it default 2014-03-01 15:16:30 -08:00
tqchen
394d325078 add col maker 2014-03-01 14:00:09 -08:00
Tianqi Chen
1f04893784 Update README.md 2014-02-28 20:13:01 -08:00
Tianqi Chen
260cbcd3c0 Update README.md 2014-02-28 20:10:57 -08:00
tqchen
e4a4f7d315 chg license, README 2014-02-28 20:09:40 -08:00
tqchen
b57656902e start add coltree maker 2014-02-28 11:44:50 -08:00
tqchen
82807b3a55 add dump2json 2014-02-26 18:54:12 -08:00
tqchen
733f8ae393 add pathdump 2014-02-26 17:08:23 -08:00
tqchen
4a612eb3ba modify tree so that training is standalone 2014-02-26 16:03:00 -08:00
tqchen
2c6922f432 modify tree so that training is standalone 2014-02-26 16:02:58 -08:00
tqchen
9b09cd3d49 change input data structure 2014-02-26 11:51:58 -08:00
tqchen
6fa5c30777 fix mushroom 2014-02-24 23:19:58 -08:00
tqchen
c4949c0937 finish mushroom 2014-02-24 23:06:57 -08:00
tqchen
9d6ef11eb5 add mushroom classification 2014-02-24 22:25:43 -08:00
tqchen
4aa4faa625 add mushroom 2014-02-24 22:19:40 -08:00
tqchen
daab1fef19 pass simple test 2014-02-20 22:28:05 -08:00
tqchen
e52720976c changes to reg booster 2014-02-20 22:08:31 -08:00
kalenhaha
a0dddaf224 tab eliminated 2014-02-19 13:25:01 +08:00
kalenhaha
a20b1d1866 add toy data 2014-02-19 13:01:15 +08:00
kalenhaha
e1b5b99113 add in reg.conf for configuration demo 2014-02-18 16:49:23 +08:00
kalenhaha
7821ef3a7c Merge branch 'master' of https://github.com/tqchen/xgboost 2014-02-16 14:34:35 +08:00
kalenhaha
6d500b2964 fix some bugs 2014-02-16 11:44:03 +08:00
tqchen
f204dd7fcf fix nboosters 2014-02-15 19:42:02 -08:00
tqchen
c38399b989 update license 2014-02-15 17:45:48 -08:00
tqchen
ece5f00ca1 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-02-15 17:42:31 -08:00
tqchen
db938ff595 update license 2014-02-15 17:42:23 -08:00
tqchen
5c09686c78 Update README.md 2014-02-15 11:22:50 -08:00
kalenhaha
32e670a4da Comments added 2014-02-13 13:04:55 +08:00
kalenhaha
4dfc4491c2 GBRT Train and Test Phase added 2014-02-12 23:30:32 +08:00
tqchen
d6261c25f2 Update README.md 2014-02-11 20:38:06 -08:00
tqchen
bf81263301 chg fmt to libsvm 2014-02-10 21:41:43 -08:00
tqchen
45a452b27e cleanup reg 2014-02-10 21:09:09 -08:00
tqchen
56e4a2ced1 add regression data 2014-02-10 20:32:23 -08:00
kalenhaha
4d1d3712ea Merge branch 'master' of https://github.com/tqchen/xgboost 2014-02-11 11:19:27 +08:00
kalenhaha
fb568a7a47 gbrt modified 2014-02-11 11:07:00 +08:00
kalenhaha
3afd186ea9 gbrt implemented 2014-02-10 23:40:38 +08:00
tqchen
365b8c4bdc Update README.md 2014-02-08 19:02:33 -08:00
tqchen
6c38e35ffb Update README.md 2014-02-08 13:01:10 -08:00
tqchen
08604d35fc Update README.md 2014-02-08 13:00:49 -08:00
tqchen
52058735d0 Update README.md 2014-02-08 12:50:24 -08:00
tqchen
6a43247bc3 finish readme 2014-02-08 11:47:37 -08:00
tqchen
33acaaa3ae add linear booster 2014-02-08 11:24:35 -08:00
tqchen
d656d9df2c add ok 2014-02-07 22:51:16 -08:00
tqchen
e8feddc6a8 chg makefile 2014-02-07 22:43:13 -08:00
tqchen
bed2e26019 adapt tree booster 2014-02-07 22:41:32 -08:00
tqchen
5d052b9e14 adapt svdfeature tree 2014-02-07 22:38:26 -08:00
tqchen
bf36374678 add detailed comment about gbmcore 2014-02-07 20:30:39 -08:00
tqchen
1e7ac402e6 add empty folder for regression. TODO 2014-02-07 20:20:09 -08:00
tqchen
9ee1048fe9 move core code to booster 2014-02-07 20:13:27 -08:00
tqchen
0d3ecd9033 add base code 2014-02-07 18:40:53 -08:00
tqchen
4e2d67b81a sync everything 2014-02-06 21:28:47 -08:00
tqchen
51d8409e30 add config 2014-02-06 21:26:27 -08:00
tqchen
ee7643bdf6 update this folder 2014-02-06 16:06:59 -08:00
tqchen
5a2b8678fc update this folder 2014-02-06 16:06:18 -08:00
tqchen
750871a158 initial cleanup of interface 2014-02-06 16:03:04 -08:00
tqchen
aecfbf5096 init commit 2014-02-06 15:50:50 -08:00
282 changed files with 49165 additions and 205 deletions

51
.gitignore vendored
View File

@@ -2,12 +2,59 @@
*.slo
*.lo
*.o
*.page
# Compiled Dynamic libraries
*.so
*.dylib
*.page
# Compiled Static libraries
*.lai
*.la
*.a
*~
*.Rcheck
*.rds
*.tar.gz
*txt*
*conf
*buffer
*model
*pyc
*train
*test
*group
*rar
*vali
*data
*sdf
Release
*exe*
*exp
ipch
*.filters
*.user
*log
Debug
*suo
*test*
.Rhistory
*.dll
*i386
*x64
*dump
*save
*csv
.Rproj.user
*.cpage.col
*.cpage
*.Rproj
xgboost
xgboost.mpi
xgboost.mock
train*
rabit
#.Rbuildignore
R-package.Rproj
*.cache*
R-package/inst
R-package/src

36
CHANGES.md Normal file
View File

@@ -0,0 +1,36 @@
Change Log
=====
xgboost-0.1
=====
* Initial release
xgboost-0.2x
=====
* Python module
* Weighted sample instances
* Initial version of pairwise rank
xgboost-0.3
=====
* Faster tree construction module
- Allows subsampling of columns during tree construction via ```bst:colsample_bytree=ratio```
* Support for boosting from initial predictions
* Experimental version of LambdaRank
* Linear booster is now parallelized, using parallel coordinate descent.
* Add [Code Guide](src/README.md) for customizing objective function and evaluation
* Add R module
xgboost-0.4
=====
* Distributed version of xgboost that runs on YARN, scales to billions of examples
* Direct save/load data and model from/to S3 and HDFS
* Feature importance visualization in R module, by Michael Benesty
* Predict leaf index
* Poisson regression for counts data
* Early stopping option in training
* Native save load support in R and python
- xgboost models now can be saved using save/load in R
- xgboost python model is now picklable
* sklearn wrapper is supported in python module
* Experimental External memory version

209
LICENSE
View File

@@ -1,202 +1,13 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
Copyright (c) 2014 by Tianqi Chen and Contributors
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

131
Makefile Normal file
View File

@@ -0,0 +1,131 @@
export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC
ifeq ($(OS), Windows_NT)
export CXX = g++ -m64
export CC = gcc -m64
endif
ifeq ($(no_omp),1)
CFLAGS += -DDISABLE_OPENMP
else
CFLAGS += -fopenmp
endif
# C++11 is opt-in: -std=c++11 is only added when make is invoked with cxx11=1.
# The else branch is intentionally empty, so no -std flag is added otherwise.
ifeq ($(cxx11),1)
CFLAGS += -std=c++11
else
endif
# handling dmlc
ifdef dmlc
ifndef config
ifneq ("$(wildcard $(dmlc)/config.mk)","")
config = $(dmlc)/config.mk
else
config = $(dmlc)/make/config.mk
endif
endif
include $(config)
include $(dmlc)/make/dmlc.mk
LDFLAGS+= $(DMLC_LDFLAGS)
LIBDMLC=$(dmlc)/libdmlc.a
else
LIBDMLC=dmlc_simple.o
endif
ifeq ($(OS), Windows_NT)
LIBRABIT = subtree/rabit/lib/librabit_empty.a
SLIB = wrapper/xgboost_wrapper.dll
else
LIBRABIT = subtree/rabit/lib/librabit.a
SLIB = wrapper/libxgboostwrapper.so
endif
# specify tensor path
BIN = xgboost
MOCKBIN = xgboost.mock
OBJ = updater.o gbm.o io.o main.o dmlc_simple.o
MPIBIN =
TARGET = $(BIN) $(OBJ) $(SLIB)
.PHONY: clean all mpi python Rpack
all: $(BIN) $(OBJ) $(SLIB)
mpi: $(MPIBIN)
python: wrapper/libxgboostwrapper.so
# now the wrapper takes in two files. io and wrapper part
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
dmlc_simple.o: src/io/dmlc_simple.cpp src/utils/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
xgboost: updater.o gbm.o io.o main.o $(LIBRABIT) $(LIBDMLC)
wrapper/xgboost_wrapper.dll wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o $(LIBRABIT) $(LIBDMLC)
# dependency on rabit
subtree/rabit/lib/librabit.a: subtree/rabit/src/engine.cc
+ cd subtree/rabit;make lib/librabit.a; cd ../..
subtree/rabit/lib/librabit_empty.a: subtree/rabit/src/engine_empty.cc
+ cd subtree/rabit;make lib/librabit_empty.a; cd ../..
subtree/rabit/lib/librabit_mock.a: subtree/rabit/src/engine_mock.cc
+ cd subtree/rabit;make lib/librabit_mock.a; cd ../..
subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
+ cd subtree/rabit;make lib/librabit_mpi.a; cd ../..
$(BIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(MOCKBIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(SLIB) :
$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) $(DLLFLAGS)
$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
$(MPIOBJ) :
$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
$(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
install:
cp -f -r $(BIN) $(INSTALL_PATH)
Rpack:
make clean
cd subtree/rabit;make clean;cd ..
rm -rf xgboost xgboost*.tar.gz
cp -r R-package xgboost
rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
rm -rf xgboost/src/*/*.o
rm -rf subtree/rabit/src/*.o
rm -rf xgboost/demo/*.model xgboost/demo/*.buffer xgboost/demo/*.txt
rm -rf xgboost/demo/runall.R
cp -r src xgboost/src/src
mkdir xgboost/src/subtree
mkdir xgboost/src/subtree/rabit
cp -r subtree/rabit/include xgboost/src/subtree/rabit/include
cp -r subtree/rabit/src xgboost/src/subtree/rabit/src
rm -rf xgboost/src/subtree/rabit/src/*.o
mkdir xgboost/src/wrapper
cp wrapper/xgboost_wrapper.h xgboost/src/wrapper
cp wrapper/xgboost_wrapper.cpp xgboost/src/wrapper
cp ./LICENSE xgboost
cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
cp xgboost/src/Makevars xgboost/src/Makevars.win
# R CMD build --no-build-vignettes xgboost
R CMD build xgboost
rm -rf xgboost
R CMD check --as-cran xgboost*.tar.gz
clean:
$(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
cd subtree/rabit; make clean; cd ..

5
R-package/.Rbuildignore Normal file
View File

@@ -0,0 +1,5 @@
\.o$
\.so$
\.dll$
^.*\.Rproj$
^\.Rproj\.user$

34
R-package/DESCRIPTION Normal file
View File

@@ -0,0 +1,34 @@
Package: xgboost
Type: Package
Title: eXtreme Gradient Boosting
Version: 0.4-0
Date: 2015-05-11
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>, Michael Benesty <michael@benesty.fr>
Maintainer: Tong He <hetong007@gmail.com>
Description: Xgboost is short for eXtreme Gradient Boosting, which is an
efficient and scalable implementation of gradient boosting framework.
This package is an R wrapper of xgboost. The package includes efficient
linear model solver and tree learning algorithms. The package can automatically
do parallel computation with OpenMP, and it can be more than 10 times faster
than existing gradient boosting packages such as gbm. It supports various
objective functions, including regression, classification and ranking. The
package is made to be extensible, so that users are also allowed to define
their own objectives easily.
License: Apache License (== 2.0) | file LICENSE
URL: https://github.com/dmlc/xgboost
BugReports: https://github.com/dmlc/xgboost/issues
VignetteBuilder: knitr
Suggests:
knitr,
ggplot2 (>= 1.0.0),
DiagrammeR (>= 0.6),
Ckmeans.1d.dp (>= 3.3.1),
vcd (>= 1.3)
Depends:
R (>= 2.10)
Imports:
Matrix (>= 1.1-0),
methods,
data.table (>= 1.9.4),
magrittr (>= 1.5),
stringr (>= 0.6.2)

13
R-package/LICENSE Normal file
View File

@@ -0,0 +1,13 @@
Copyright (c) 2014 by Tianqi Chen and Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

43
R-package/NAMESPACE Normal file
View File

@@ -0,0 +1,43 @@
# Generated by roxygen2 (4.1.1): do not edit by hand
export(getinfo)
export(setinfo)
export(slice)
export(xgb.DMatrix)
export(xgb.DMatrix.save)
export(xgb.cv)
export(xgb.dump)
export(xgb.importance)
export(xgb.load)
export(xgb.model.dt.tree)
export(xgb.plot.importance)
export(xgb.plot.tree)
export(xgb.save)
export(xgb.save.raw)
export(xgb.train)
export(xgboost)
exportMethods(nrow)
exportMethods(predict)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgeMatrix)
importFrom(Matrix,cBind)
importFrom(Matrix,colSums)
importFrom(Matrix,sparseVector)
importFrom(data.table,":=")
importFrom(data.table,as.data.table)
importFrom(data.table,copy)
importFrom(data.table,data.table)
importFrom(data.table,fread)
importFrom(data.table,rbindlist)
importFrom(data.table,set)
importFrom(data.table,setnames)
importFrom(magrittr,"%>%")
importFrom(magrittr,add)
importFrom(magrittr,not)
importFrom(stringr,str_extract)
importFrom(stringr,str_extract_all)
importFrom(stringr,str_match)
importFrom(stringr,str_replace)
importFrom(stringr,str_split)
importFrom(stringr,str_trim)

View File

@@ -0,0 +1,57 @@
# Register 'xgb.DMatrix' as an S4 class name so that it can be used in the
# signature of the setMethod() call below.
setClass('xgb.DMatrix')
#' Get information of an xgb.DMatrix object
#'
#' Get information of an xgb.DMatrix object
#'
#' The information can be one of the following:
#'
#' \itemize{
#' \item \code{label}: label Xgboost learn from ;
#' \item \code{weight}: to do a weight rescale ;
#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
#' \item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all(labels2 == 1-labels))
#' @rdname getinfo
#' @export
#'
getinfo <- function(object, ...) UseMethod("getinfo")  # dispatch on the class of `object`
#' @param object Object of class \code{xgb.DMatrix}
#' @param name the name of the field to get
#' @param ... other parameters
#' @rdname getinfo
#' @method getinfo xgb.DMatrix
setMethod("getinfo", signature = "xgb.DMatrix",
          definition = function(object, name) {
            # Read one metadata field from an xgb.DMatrix.
            #   object: the xgb.DMatrix to query
            #   name:   one of "label", "weight", "base_margin" or "nrow"
            # Returns the value produced by the native XGDMatrixGetInfo_R call,
            # or the row count from xgb.numrow() when name is "nrow".
            # Stops with an error for a non-character or unknown name.
            if (typeof(name) != "character") {
              stop("xgb.getinfo: name must be character")
            }
            # inherits() always yields a single TRUE/FALSE, unlike class(x) != "..."
            if (!inherits(object, "xgb.DMatrix")) {
              # The message previously blamed "xgb.setinfo", which is a different function.
              stop("xgb.getinfo: first argument dtrain must be xgb.DMatrix")
            }
            if (name != "label" && name != "weight" &&
                name != "base_margin" && name != "nrow") {
              stop(paste("xgb.getinfo: unknown info name", name))
            }
            # "nrow" is not fetched through XGDMatrixGetInfo_R; it is answered by xgb.numrow().
            if (name != "nrow") {
              ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
            } else {
              ret <- xgb.numrow(object)
            }
            return(ret)
          })

View File

@@ -0,0 +1,19 @@
# Make nrow() an S4 generic so the xgb.DMatrix method below can be attached with setMethod().
setGeneric("nrow")
#' @title Number of xgb.DMatrix rows
#' @description \code{nrow} returns the number of rows present in the \code{xgb.DMatrix}.
#' @param x Object of class \code{xgb.DMatrix}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' stopifnot(nrow(dtrain) == nrow(train$data))
#'
#' @export
setMethod("nrow",
signature = "xgb.DMatrix",
definition = function(x) {
xgb.numrow(x)
}
)

View File

@@ -0,0 +1,75 @@
# Register the class name of the booster handle so it can be used as a slot
# type and in S4 method signatures.
setClass("xgb.Booster.handle")
# An xgb.Booster has two slots:
#   handle - an object of class "xgb.Booster.handle"
#   raw    - a raw vector
setClass("xgb.Booster",
slots = c(handle = "xgb.Booster.handle",
raw = "raw"))
#' Predict method for eXtreme Gradient Boosting model
#'
#' Predicted values based on xgboost model object.
#'
#' @param object Object of class "xgb.Booster"
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param outputmargin whether the prediction should be shown in the original
#' value of sum of functions, when outputmargin=TRUE, the prediction is
#' untransformed margin value. In logistic regression, outputmargin=T will
#' output value before logistic transformation.
#' @param ntreelimit limit number of trees used in prediction, this parameter is
#' only valid for gbtree, but not for gblinear. set it to be value bigger
#' than 0. It will use all trees by default.
#' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' pred <- predict(bst, test$data)
#' @export
#'
setMethod("predict", signature = "xgb.Booster",
          definition = function(object, newdata, missing = NULL,
                                outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
            # Predict with an xgb.Booster.
            #   object:       the model; it is passed through xgb.Booster.check() before use
            #   newdata:      an xgb.DMatrix, or anything xgb.DMatrix() accepts
            #   missing:      forwarded to xgb.DMatrix() only when newdata has to be converted
            #   outputmargin: adds 1 to the option code sent to the native predictor
            #   ntreelimit:   NULL is sent as 0; otherwise it must be >= 1
            #   predleaf:     adds 2 to the option code and reshapes the result into a matrix
            # Returns the native prediction vector, or a matrix when predleaf is TRUE.

            # inherits() always yields a single TRUE/FALSE. class(x) != "..." does not:
            # a plain matrix has class c("matrix", "array"), which made the old
            # comparison a length-2 condition inside if().
            if (!inherits(object, "xgb.Booster")) {
              stop("predict: model in prediction must be of class xgb.Booster")
            }
            object <- xgb.Booster.check(object, saveraw = FALSE)

            if (!inherits(newdata, "xgb.DMatrix")) {
              if (is.null(missing)) {
                newdata <- xgb.DMatrix(newdata)
              } else {
                newdata <- xgb.DMatrix(newdata, missing = missing)
              }
            }

            if (is.null(ntreelimit)) {
              ntreelimit <- 0
            } else if (ntreelimit < 1) {
              stop("predict: ntreelimit must be equal to or greater than 1")
            }

            # Option code: +1 for outputmargin, +2 for predleaf.
            option <- 0
            if (outputmargin) {
              option <- option + 1
            }
            if (predleaf) {
              option <- option + 2
            }

            ret <- .Call("XGBoosterPredict_R", object$handle, newdata, as.integer(option),
                         as.integer(ntreelimit), PACKAGE = "xgboost")

            if (predleaf) {
              len <- getinfo(newdata, "nrow")
              if (length(ret) == len) {
                # One value per row: return a single-column matrix.
                ret <- matrix(ret, ncol = 1)
              } else {
                # Fill a matrix with one column per data row, then transpose so
                # that each row of the result belongs to one row of newdata.
                ret <- t(matrix(ret, ncol = len))
              }
            }
            return(ret)
          })

View File

@@ -0,0 +1,19 @@
#' Predict method for eXtreme Gradient Boosting model handle
#'
#' Predicted values based on xgb.Booster.handle object.
#'
#' @param object Object of class "xgb.Booster.handle"
#' @param ... Parameters pass to \code{predict.xgb.Booster}
#'
setMethod("predict", signature = "xgb.Booster.handle",
          definition = function(object, ...) {
            # Guard first, then convert the handle to a booster and delegate
            # to predict() on that booster, forwarding all extra arguments.
            if (class(object) != "xgb.Booster.handle") {
              stop("predict: model in prediction must be of class xgb.Booster.handle")
            }
            booster <- xgb.handleToBooster(object)
            return(predict(booster, ...))
          })

View File

@@ -0,0 +1,38 @@
#' Set information of an xgb.DMatrix object
#'
#' Set information of an xgb.DMatrix object
#'
#' It can be one of the following:
#'
#' \itemize{
#' \item \code{label}: label Xgboost learn from ;
#' \item \code{weight}: to do a weight rescale ;
#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
#' \item \code{group}.
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all(labels2 == 1-labels))
#' @rdname setinfo
#' @export
#'
setinfo <- function(object, ...) UseMethod("setinfo")  # dispatch on the class of `object`
#' @param object Object of class "xgb.DMatrix"
#' @param name the name of the field to set
#' @param info the specific field of information to set
#' @param ... other parameters
#' @rdname setinfo
#' @method setinfo xgb.DMatrix
setMethod("setinfo", signature = "xgb.DMatrix",
          definition = function(object, name, info) xgb.setinfo(object, name, info))  # thin wrapper

View File

@@ -0,0 +1,45 @@
# Register 'xgb.DMatrix' as an S4 class name so that it can be used in the
# signature of the slice method defined in this file.
setClass('xgb.DMatrix')
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
#'
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' dsub <- slice(dtrain, 1:3)
#' @rdname slice
#' @export
#'
slice <- function(object, ...) UseMethod("slice")  # dispatch on the class of `object`
#' @param object Object of class "xgb.DMatrix"
#' @param idxset an integer vector of indices of rows needed
#' @param ... other parameters
#' @rdname slice
#' @method slice xgb.DMatrix
setMethod("slice", signature = "xgb.DMatrix",
          definition = function(object, idxset, ...) {
            # Build a new xgb.DMatrix from the rows of `object` selected by `idxset`.
            if (class(object) != "xgb.DMatrix") {
              stop("slice: first argument dtrain must be xgb.DMatrix")
            }
            sliced <- .Call("XGDMatrixSliceDMatrix_R", object, idxset,
                            PACKAGE = "xgboost")

            # Any attribute of `object` whose length equals the row count is
            # subset with the same indices and copied onto the result.
            all_attrs <- attributes(object)
            n_rows <- xgb.numrow(object)
            attr_lengths <- sapply(all_attrs, length)
            row_aligned <- names(all_attrs)[which(attr_lengths == n_rows)]
            for (attr_name in row_aligned) {
              attr(sliced, attr_name) <- attr(object, attr_name)[idxset]
            }

            structure(sliced, class = "xgb.DMatrix")
          })

344
R-package/R/utils.R Normal file
View File

@@ -0,0 +1,344 @@
#' @importClassesFrom Matrix dgCMatrix dgeMatrix
#' @import methods
# depends on matrix
.onLoad <- function(libname, pkgname) {
  # Load the compiled "xgboost" shared library for this package.
  library.dynam(chname = "xgboost", package = pkgname, lib.loc = libname)
}
.onUnload <- function(libpath) {
  # Release the compiled "xgboost" shared library.
  library.dynam.unload(chname = "xgboost", libpath = libpath)
}
# set information into dmatrix, this mutate dmatrix
xgb.setinfo <- function(dmat, name, info) {
  # Store one metadata field ("label", "weight", "base_margin" or "group")
  # in an xgb.DMatrix. The DMatrix is modified in place; returns TRUE on
  # success and raises an error for a wrong object type, a size mismatch or
  # an unknown field name.
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
  }
  # Hand an already-coerced vector to the native setter.
  store <- function(values) {
    .Call("XGDMatrixSetInfo_R", dmat, name, values,
          PACKAGE = "xgboost")
    TRUE
  }
  if (name == "label") {
    if (length(info) != xgb.numrow(dmat))
      stop("The length of labels must equal to the number of rows in the input data")
    return(store(as.numeric(info)))
  } else if (name == "weight") {
    if (length(info) != xgb.numrow(dmat))
      stop("The length of weights must equal to the number of rows in the input data")
    return(store(as.numeric(info)))
  } else if (name == "base_margin") {
    # No length validation is applied to base_margin.
    return(store(as.numeric(info)))
  } else if (name == "group") {
    # Group sizes must add up to the number of rows.
    if (sum(info) != xgb.numrow(dmat))
      stop("The sum of groups must equal to the number of rows in the input data")
    return(store(as.integer(info)))
  }
  stop(paste("xgb.setinfo: unknown info name", name))
}
# construct a Booster from cachelist
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
  # Create a native booster handle.
  #   params    - named list of parameters; dots in names become underscores
  #               and values are passed as character strings
  #   cachelist - list of xgb.DMatrix objects handed to the native constructor
  #   modelfile - optional file name (character) or raw vector to load from
  # Returns the handle with class "xgb.Booster.handle".
  if (typeof(cachelist) != "list") {
    stop("xgb.Booster: only accepts list of DMatrix as cachelist")
  }
  for (cached in cachelist) {
    if (class(cached) != "xgb.DMatrix") {
      stop("xgb.Booster: only accepts list of DMatrix as cachelist")
    }
  }
  handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost")
  for (idx in seq_along(params)) {
    # Keep the one-element sub-list so that name and value travel together.
    entry <- params[idx]
    key <- gsub("\\.", "_", names(entry))
    .Call("XGBoosterSetParam_R", handle, key, as.character(entry),
          PACKAGE = "xgboost")
  }
  if (!is.null(modelfile)) {
    source_kind <- typeof(modelfile)
    if (source_kind == "character") {
      .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
    } else if (source_kind == "raw") {
      .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost")
    } else {
      stop("xgb.Booster: modelfile must be character or raw vector")
    }
  }
  return(structure(handle, class = "xgb.Booster.handle"))
}
# convert xgb.Booster.handle to xgb.Booster
xgb.handleToBooster <- function(handle, raw = NULL) {
  # Wrap a booster handle and its optional raw serialisation in a
  # two-element list of class "xgb.Booster".
  structure(list(handle = handle, raw = raw), class = "xgb.Booster")
}
# Check whether an xgb.Booster object is complete
xgb.Booster.check <- function(bst, saveraw = TRUE) {
  # Make an xgb.Booster usable again: rebuild the handle from the stored raw
  # model when the handle is absent or reported as a null pointer, otherwise
  # optionally fill in the missing raw copy. Returns the updated object.
  handle_lost <- if (is.null(bst$handle)) {
    TRUE
  } else {
    .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost")
  }
  if (handle_lost) {
    bst$handle <- xgb.Booster(modelfile = bst$raw)
  } else if (is.null(bst$raw) && saveraw) {
    bst$raw <- xgb.save.raw(bst$handle)
  }
  return(bst)
}
## ----the following are low level iteratively function, not needed if
## you do not want to use them ---------------------------------------
# get dmatrix from data, label
xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
  # Turn the supported kinds of input into an xgb.DMatrix.
  #   data    - dense matrix, dgCMatrix, file name (character) or xgb.DMatrix
  #   label   - required for matrix input; ignored (with a warning) otherwise
  #   missing - forwarded to xgb.DMatrix() only when not NULL, so that the
  #             constructor's own default applies otherwise
  # Raises an error for any other kind of input.
  #
  # inherits()/is.matrix() are used instead of comparing class(data) with a
  # string: class() of a matrix has more than one element on R >= 4.0, which
  # makes a scalar `==` / `||` test on it fail.
  if (inherits(data, "dgCMatrix") || is.matrix(data)) {
    if (is.null(label)) {
      stop("xgboost: need label when data is a matrix")
    }
    if (is.null(missing)) {
      return(xgb.DMatrix(data, label = label))
    }
    return(xgb.DMatrix(data, label = label, missing = missing))
  }
  if (!is.null(label)) {
    warning("xgboost: label will be ignored.")
  }
  if (is.character(data)) {
    return(xgb.DMatrix(data))
  }
  if (inherits(data, "xgb.DMatrix")) {
    return(data)
  }
  stop("xgboost: Invalid input of data")
}
xgb.numrow <- function(dmat) {
  # Ask the native library for the number of rows of `dmat`.
  return(.Call("XGDMatrixNumRow_R", dmat, PACKAGE="xgboost"))
}
# iteratively update booster with customized statistics
xgb.iter.boost <- function(booster, dtrain, gpair) {
  # Run one boosting iteration from user-supplied statistics.
  #   booster - object of class "xgb.Booster.handle"
  #   dtrain  - object of class "xgb.DMatrix"
  #   gpair   - list whose `grad` and `hess` elements are passed to the
  #             native booster
  # Returns TRUE; raises an error when an argument has the wrong class.
  # The error messages name this function (they previously said
  # "xgb.iter.update", which pointed callers at the wrong place).
  if (!inherits(booster, "xgb.Booster.handle")) {
    stop("xgb.iter.boost: first argument must be type xgb.Booster.handle")
  }
  if (!inherits(dtrain, "xgb.DMatrix")) {
    stop("xgb.iter.boost: second argument must be type xgb.DMatrix")
  }
  .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess,
        PACKAGE = "xgboost")
  return(TRUE)
}
# iteratively update booster with dtrain
xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
  # Advance `booster` by one iteration on `dtrain`.
  # With obj = NULL the native update is called with iteration number `iter`;
  # otherwise obj(pred, dtrain) supplies the statistics that are handed to
  # xgb.iter.boost(). Always returns TRUE.
  if (class(booster) != "xgb.Booster.handle") {
    stop("xgb.iter.update: first argument must be type xgb.Booster.handle")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.iter.update: second argument must be type xgb.DMatrix")
  }
  if (!is.null(obj)) {
    current_pred <- predict(booster, dtrain)
    stats_pair <- obj(current_pred, dtrain)
    xgb.iter.boost(booster, dtrain, stats_pair)
  } else {
    .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
          PACKAGE = "xgboost")
  }
  return(TRUE)
}
# iteratively evaluate one iteration
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = FALSE) {
  # Evaluate `booster` on every entry of `watchlist` for iteration `iter`.
  # With feval = NULL the native evaluator builds the message; otherwise the
  # message is assembled from feval(preds, dmatrix)$metric / $value.
  # Returns the message, or list(message, predictions on watchlist[[2]])
  # when prediction is TRUE.
  if (class(booster) != "xgb.Booster.handle") {
    stop("xgb.eval: first argument must be type xgb.Booster")
  }
  if (typeof(watchlist) != "list") {
    stop("xgb.eval: only accepts list of DMatrix as watchlist")
  }
  for (entry in watchlist) {
    if (class(entry) != "xgb.DMatrix") {
      stop("xgb.eval: watch list can only contain xgb.DMatrix")
    }
  }
  n_watch <- length(watchlist)
  if (n_watch == 0) {
    msg <- ""
  } else if (is.null(feval)) {
    # Collect the name tag of every entry for the native evaluator.
    evnames <- list()
    for (idx in seq_len(n_watch)) {
      tag <- names(watchlist[idx])
      if (length(tag) == 0) {
        stop("xgb.eval: name tag must be presented for every elements in watchlist")
      }
      evnames <- append(evnames, tag)
    }
    msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
                 evnames, PACKAGE = "xgboost")
  } else {
    msg <- paste("[", iter, "]", sep="")
    for (idx in seq_len(n_watch)) {
      entry <- watchlist[idx]
      tag <- names(entry)
      if (length(tag) == 0) {
        stop("xgb.eval: name tag must be presented for every elements in watchlist")
      }
      fold_pred <- predict(booster, entry[[1]])
      scored <- feval(fold_pred, entry[[1]])
      msg <- paste(msg, "\t", tag, "-", scored$metric, ":", scored$value, sep="")
    }
  }
  if (prediction) {
    # The second watchlist entry is the one predictions are returned for.
    return(list(msg, predict(booster, watchlist[[2]])))
  }
  return(msg)
}
#------------------------------------------
# helper functions for cross validation
#
xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
  # Prepare the per-fold training state for cross validation.
  #   dall       - xgb.DMatrix holding all rows
  #   nfold      - number of folds (must be > 1)
  #   param      - booster parameter list
  #   stratified - whether automatically created folds are stratified by label
  #   folds      - optional list of pre-computed test-row index vectors; when
  #                NULL the folds are generated here
  # Returns a list with one element per fold, each a list with elements
  # dtrain, booster, watchlist (train, test) and index (the test rows).
  if (nfold <= 1) {
    stop("nfold must be bigger than 1")
  }
  if (is.null(folds)) {
    if (exists('objective', where=param) && strtrim(param[['objective']], 5) == 'rank:') {
      stop("\tAutomatic creation of CV-folds is not implemented for ranking!\n",
           "\tConsider providing pre-computed CV-folds through the folds parameter.")
    }
    y <- getinfo(dall, 'label')
    randidx <- sample(1 : xgb.numrow(dall))
    if (stratified && length(y) == length(randidx)) {
      y <- y[randidx]
      #
      # WARNING: some heuristic logic is employed to identify classification setting!
      #
      # For classification, need to convert y labels to factor before making the folds,
      # and then do stratification by factor levels.
      # For regression, leave y numeric and do stratification by quantiles.
      if (exists('objective', where=param)) {
        # If 'objective' provided in params, assume that y is a classification label
        # unless objective is reg:linear
        if (param[['objective']] != 'reg:linear') y <- factor(y)
      } else {
        # If no 'objective' given in params, it means that user either wants to use
        # the default 'reg:linear' objective or has provided a custom obj function.
        # Here, assume classification setting when y has 5 or less unique values:
        if (length(unique(y)) <= 5) y <- factor(y)
      }
      # xgb.createFolds() returns positions within the shuffled label vector.
      # Translate them back through randidx so that each fold holds row
      # numbers of `dall`; using the positions directly as row numbers would
      # pair every row with another row's label and undo the stratification.
      folds <- lapply(xgb.createFolds(y, nfold), function(pos) randidx[pos])
    } else {
      # make simple non-stratified folds
      kstep <- length(randidx) %/% nfold
      folds <- list()
      for (i in 1:(nfold-1)) {
        folds[[i]] <- randidx[1:kstep]
        randidx <- setdiff(randidx, folds[[i]])
      }
      # The last fold takes whatever rows remain.
      folds[[nfold]] <- randidx
    }
  }
  ret <- list()
  for (k in 1:nfold) {
    dtest <- slice(dall, folds[[k]])
    # Training rows are the union of all other folds.
    didx <- c()
    for (i in 1:nfold) {
      if (i != k) {
        didx <- append(didx, folds[[i]])
      }
    }
    dtrain <- slice(dall, didx)
    bst <- xgb.Booster(param, list(dtrain, dtest))
    watchlist <- list(train=dtrain, test=dtest)
    ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=folds[[k]])
  }
  return (ret)
}
xgb.cv.aggcv <- function(res, showsd = TRUE) {
  # Aggregate the per-fold evaluation messages of one boosting round.
  #   res    - list with one character vector per fold; element 1 is the
  #            iteration tag, the remaining elements look like "name:value"
  #   showsd - append "+<sd>" after each mean when TRUE
  # Returns a single tab-separated string "<tag>\t<name>:<mean>[+<sd>]...".
  # The names and the tag are taken from the first fold.
  header <- res[[1]]
  ret <- header[1]
  # seq_along(header)[-1] is empty when the header holds only the tag,
  # whereas 2:length(header) would count backwards (2, 1).
  for (i in seq_along(header)[-1]) {
    metric_name <- strsplit(header[i], ":")[[1]][1]
    # Value of metric i in every fold.
    stats <- vapply(res, function(fold_msg) {
      as.numeric(strsplit(fold_msg[i], ":")[[1]][2])
    }, numeric(1))
    ret <- paste(ret, "\t", metric_name, ":", sprintf("%f", mean(stats)), sep="")
    if (showsd) {
      ret <- paste(ret, sprintf("+%f", sd(stats)), sep="")
    }
  }
  return (ret)
}
# Shamelessly copied from caret::createFolds
# and simplified by always returning an unnamed list of test indices
xgb.createFolds <- function(y, k = 10)
{
# Split the positions of `y` into at most `k` folds, balancing the classes of
# `y` across folds. Numeric `y` is first binned by quantiles. Returns an
# unnamed list of integer vectors holding positions within `y`.
# The result depends on the random number stream (sample() is used).
if(is.numeric(y)) {
## Group the numeric data based on their magnitudes
## and sample within those groups.
## When the number of samples is low, we may have
## issues further slicing the numeric data into
## groups. The number of groups will depend on the
## ratio of the number of folds to the sample size.
## At most, we will use quantiles. If the sample
## is too small, we just do regular unstratified
## CV
cuts <- floor(length(y)/k)
if(cuts < 2) cuts <- 2
if(cuts > 5) cuts <- 5
y <- cut(y,
unique(quantile(y, probs = seq(0, 1, length = cuts))),
include.lowest = TRUE)
}
if(k < length(y)) {
## reset levels so that the possible levels and
## the levels in the vector are the same
y <- factor(as.character(y))
numInClass <- table(y)
foldVector <- vector(mode = "integer", length(y))
## For each class, balance the fold allocation as far
## as possible, then resample the remainder.
## The final assignment of folds is also randomized.
for(i in 1:length(numInClass)) {
## create a vector of integers from 1:k as many times as possible without
## going over the number of samples in the class. Note that if the number
## of samples in a class is less than k, nothing is produced here.
seqVector <- rep(1:k, numInClass[i] %/% k)
## add enough random integers to get length(seqVector) == numInClass[i]
if(numInClass[i] %% k > 0) seqVector <- c(seqVector, sample(1:k, numInClass[i] %% k))
## shuffle the integers for fold assignment and assign to this classes's data
foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector)
}
# With k >= length(y) every observation becomes its own fold.
} else foldVector <- seq(along = y)
out <- split(seq(along = y), foldVector)
names(out) <- NULL
out
}

45
R-package/R/xgb.DMatrix.R Normal file
View File

@@ -0,0 +1,45 @@
#' Construct xgb.DMatrix object
#'
#' Construct xgb.DMatrix object from dense matrix, sparse matrix or local file.
#'
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character
#' indicating the data file.
#' @param info a list of information of the xgb.DMatrix object
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#'
#' @param ... other information to pass to \code{info}.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' @export
#'
xgb.DMatrix <- function(data, info = list(), missing = 0, ...) {
  # Pick the native constructor that matches the kind of input.
  if (typeof(data) == "character") {
    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE),
                    PACKAGE = "xgboost")
  } else if (is.matrix(data)) {
    handle <- .Call("XGDMatrixCreateFromMat_R", data, missing,
                    PACKAGE = "xgboost")
  } else if (class(data) == "dgCMatrix") {
    handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x,
                    PACKAGE = "xgboost")
  } else {
    stop(paste("xgb.DMatrix: does not support to construct from ",
               typeof(data)))
  }
  dmat <- structure(handle, class = "xgb.DMatrix")
  # Fields given through `info` and through `...` are stored one by one.
  fields <- append(info, list(...))
  for (idx in seq_along(fields)) {
    xgb.setinfo(dmat, names(fields[idx]), fields[[idx]])
  }
  return(dmat)
}

View File

@@ -0,0 +1,27 @@
#' Save xgb.DMatrix object to binary file
#'
#' Save xgb.DMatrix object to binary file
#'
#' @param DMatrix the DMatrix object
#' @param fname the name of the binary file.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' @export
#'
xgb.DMatrix.save <- function(DMatrix, fname) {
  # Validate both arguments up front, then write the binary file.
  if (typeof(fname) != "character") {
    stop("xgb.save: fname must be character")
  }
  if (class(DMatrix) != "xgb.DMatrix") {
    stop("xgb.DMatrix.save: the input must be xgb.DMatrix")
  }
  .Call("XGDMatrixSaveBinary_R", DMatrix, fname, as.integer(FALSE),
        PACKAGE = "xgboost")
  return(TRUE)
}

232
R-package/R/xgb.cv.R Normal file
View File

@@ -0,0 +1,232 @@
#' Cross Validation
#'
#' The cross validation function of xgboost
#'
#' @importFrom data.table data.table
#' @importFrom data.table as.data.table
#' @importFrom magrittr %>%
#' @importFrom data.table :=
#' @importFrom data.table rbindlist
#' @importFrom stringr str_extract_all
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_replace
#' @importFrom stringr str_match
#'
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#' \item \code{objective} objective function, common ones are
#' \itemize{
#' \item \code{reg:linear} linear regression
#' \item \code{binary:logistic} logistic regression for classification
#' }
#' \item \code{eta} step size of each boosting step
#' \item \code{max.depth} maximum depth of the tree
#' \item \code{nthread} number of thread used in training, if not set, all threads are used
#' }
#'
#' See \link{xgb.train} for further details.
#' See also demo/ for walkthrough example in R.
#' @param data takes an \code{xgb.DMatrix} or \code{Matrix} as the input.
#' @param nrounds the max number of iterations
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label option field, when data is \code{Matrix}
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param prediction A logical value indicating whether to return the prediction vector.
#' @param showsd \code{boolean}, whether show standard deviation of cross validation
#' @param metrics list of evaluation metrics to be used in cross validation;
#' when it is not specified, the evaluation metric is chosen according to objective function.
#' Possible options are:
#' \itemize{
#' \item \code{error} binary classification error rate
#' \item \code{rmse} Rooted mean square error
#' \item \code{logloss} negative log-likelihood function
#' \item \code{auc} Area under curve
#' \item \code{merror} Exact matching error, used to evaluate multi-class classification
#' }
#' @param obj customized objective function. Returns gradient and second order
#' gradient with given prediction and dtrain.
#' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain.
#' @param stratified \code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}
#' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
#' If folds are supplied, the nfold and stratified parameters would be ignored.
#' @param verbose \code{boolean}, print the statistics during the process
#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' keeps getting worse consecutively for \code{k} rounds.
#' @param early.stop.round An alternative of \code{early_stop_round}.
#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#'
#' @param ... other parameters to pass to \code{params}.
#'
#' @return
#' If \code{prediction = TRUE}, a list with the following elements is returned:
#' \itemize{
#' \item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
#' \item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds.
#' }
#'
#' If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned.
#'
#' @details
#' The original sample is randomly partitioned into \code{nfold} equal size subsamples.
#'
#' Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data.
#'
#' The cross-validation process is then repeated \code{nfold} times, with each of the \code{nfold} subsamples used exactly once as the validation data.
#'
#' All observations are used for both training and validation.
#'
#' Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' history <- xgb.cv(data = dtrain, nround=3, nthread = 2, nfold = 5, metrics=list("rmse","auc"),
#' max.depth =3, eta = 1, objective = "binary:logistic")
#' print(history)
#' @export
#'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
prediction = FALSE, showsd = TRUE, metrics=list(),
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,
early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) {
# ---- argument validation ----
if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list")
}
if(!is.null(folds)) {
if(class(folds)!="list" | length(folds) < 2) {
stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold")
}
# User-supplied folds override the nfold argument.
nfold <- length(folds)
}
if (nfold <= 1) {
stop("nfold must be bigger than 1")
}
# `missing` is only forwarded when given, so the constructor default applies otherwise.
if (is.null(missing)) {
dtrain <- xgb.get.DMatrix(data, label)
} else {
dtrain <- xgb.get.DMatrix(data, label, missing)
}
# ---- parameter assembly: extra arguments, silent mode, one eval_metric entry per metric ----
params <- append(params, list(...))
params <- append(params, list(silent=1))
for (mc in metrics) {
params <- append(params, list("eval_metric"=mc))
}
# Early Stopping
# early.stop.round is used only when early_stop_round is not given.
if (is.null(early_stop_round) && !is.null(early.stop.round))
early_stop_round = early.stop.round
if (!is.null(early_stop_round)){
if (!is.null(feval) && is.null(maximize))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize) && is.null(params$eval_metric))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize))
{
# Direction is inferred from the metric name: the listed metrics are minimised,
# anything else is maximised.
if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
maximize = FALSE
} else {
maximize = TRUE
}
}
# Starting point for the best score seen so far.
if (maximize) {
bestScore = 0
} else {
bestScore = Inf
}
bestInd = 0
earlyStopflag = FALSE
if (length(metrics)>1)
warning('Only the first metric is used for early stopping process.')
}
# ---- per-fold boosters and the container for out-of-fold predictions ----
xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
obj_type = params[['objective']]
mat_pred = FALSE
if (!is.null(obj_type) && obj_type=='multi:softprob')
{
# One column per class for multi:softprob.
num_class = params[['num_class']]
if (is.null(num_class))
stop('must set num_class to use softmax')
predictValues <- matrix(0,xgb.numrow(dtrain),num_class)
mat_pred = TRUE
}
else
predictValues <- rep(0,xgb.numrow(dtrain))
history <- c()
# ---- boosting rounds ----
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
fd <- xgb_folds[[k]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
if (i<nrounds) {
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
} else {
# Last round: optionally also collect the predictions returned by xgb.iter.eval
# and store them at the rows listed in fd$index.
# NOTE(review): this branch only runs when i == nrounds, so if early stopping
# breaks out of the loop first, predictValues is returned still filled with zeros.
if (!prediction) {
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
} else {
res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
if (mat_pred) {
# The flat prediction vector is reshaped to num_class rows and transposed
# so that each row of predictValues holds one observation.
pred_mat = matrix(res[[2]],num_class,length(fd$index))
predictValues[fd$index,] <- t(pred_mat)
} else {
predictValues[fd$index] <- res[[2]]
}
msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
}
}
}
# Combine the fold messages into one line of mean (and sd) per metric.
ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret)
if(verbose) paste(ret, "\n", sep="") %>% cat
# early_Stopping
if (!is.null(early_stop_round)){
# Take the whitespace-separated token at position 1+length(metrics)+1 of the
# aggregated line and read the number between ':' and '+'.
# NOTE(review): presumably this selects the first test-set metric - confirm
# against the column order produced by the evaluator.
score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+1]
score = strsplit(score,'\\+|:')[[1]][[2]]
score = as.numeric(score)
if ((maximize && score>bestScore) || (!maximize && score<bestScore)) {
bestScore = score
bestInd = i
} else {
# Stop once early_stop_round rounds have passed since the best round.
if (i-bestInd>=early_stop_round) {
earlyStopflag = TRUE
cat('Stopping. Best iteration:',bestInd)
break
}
}
}
}
# ---- turn the text history into a data.table ----
# Column names come from the first history line: "<set>-<metric>:" becomes "<set>.<metric>".
colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".")
colnamesMean <- paste(colnames, "mean")
if(showsd) colnamesStd <- paste(colnames, "std")
colnames <- c()
# With showsd the mean and std columns are interleaved per metric.
if(showsd) for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
else colnames <- colnamesMean
type <- rep(x = "numeric", times = length(colnames))
# Start from an empty, correctly typed table and append one row per round.
dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
split <- str_split(string = history, pattern = "\t")
for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)}
if (prediction) {
return(list(dt = dt,pred = predictValues))
}
return(dt)
}
# Declare "." as a known global name so that the package check does not
# report it as an undefined variable: in this file it only appears as the
# pipe placeholder and is never assigned.
globalVariables(".")

71
R-package/R/xgb.dump.R Normal file
View File

@@ -0,0 +1,71 @@
#' Save xgboost model to text file
#'
#' Save a xgboost model to text file. Could be parsed later.
#'
#' @importFrom magrittr %>%
#' @importFrom stringr str_replace
#' @importFrom data.table fread
#' @importFrom data.table :=
#' @importFrom data.table setnames
#' @param model the model object.
#' @param fname the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.
#' @param fmap feature map file representing the type of feature.
#' Detailed description could be found at
#' \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
#' See demo/ for walkthrough example in R, and
#' \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
#' for example Format.
#' @param with.stats whether dump statistics of splits
#' When this option is on, the model dump comes with two additional statistics:
#' gain is the approximate loss function gain we get in each split;
#' cover is the sum of second order gradient in each node.
#'
#' @return
#' if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' # save the model in file 'xgb.model.dump'
#' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#'
#' # print the model without saving it to a file
#' print(xgb.dump(bst))
#' @export
#'
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
  # Dump a model as text.
  #   model      - object of class "xgb.Booster"
  #   fname      - NULL, or a single file name to write the dump to
  #   fmap       - NULL, or a single feature-map file name
  #   with.stats - whether split statistics are included in the dump
  # Returns the dump as a character vector when fname is NULL, TRUE otherwise.
  if (class(model) != "xgb.Booster") {
    stop("model: argument must be type xgb.Booster")
  } else {
    model <- xgb.Booster.check(model)
  }
  if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) {
    stop("fname: argument must be type character (when provided)")
  }
  # The length check applies to fmap itself (it previously tested fname, so a
  # multi-element fmap was never rejected).
  if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1)) {
    stop("fmap: argument must be type character (when provided)")
  }
  longString <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with.stats), PACKAGE = "xgboost")
  # Re-split the dump into one row per line.
  dt <- fread(paste(longString, collapse = ""), sep = "\n", header = F)
  setnames(dt, "Lines")
  if(is.null(fname)) {
    # In-memory result: drop "0" and empty lines and strip leading tabs.
    result <- dt[Lines != "0"][, Lines := str_replace(Lines, "^\t+", "")][Lines != ""][, paste(Lines)]
    return(result)
  } else {
    # File output keeps the leading tabs.
    dt[Lines != "0"][Lines != ""][, paste(Lines)] %>% writeLines(fname)
    return(TRUE)
  }
}
# Declare the names below as known globals so that the package check does not
# report them as undefined variables: "Lines" is a data.table column name and
# "." is the pipe placeholder; neither is ever assigned in this file.
globalVariables(c("Lines", "."))

View File

@@ -0,0 +1,134 @@
#' Show importance of features in a model
#'
#' Read a xgboost model text dump.
#' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
#'
#' @importFrom data.table data.table
#' @importFrom data.table setnames
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @importFrom Matrix colSums
#' @importFrom Matrix cBind
#' @importFrom Matrix sparseVector
#'
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#'
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
#'
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#'
#' @param data the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
#'
#' @param label the label vector used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
#'
#' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.
#'
#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
#'
#' @details
#' This is the function to understand the model trained (and through your model, your data).
#'
#' Results are returned for both linear and tree models.
#'
#' \code{data.table} is returned by the function.
#' There are 3 columns :
#' \itemize{
#' \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
#' \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
#' \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
#' \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
#' }
#'
#' Co-occurence count
#' ------------------
#'
#' The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
#'
#' Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
#'
#' If you need to remember one thing only: unless you want to leave us early, don't eat a mushroom which has no odor :-)
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' # Both dataset are list with two items, a sparse matrix and labels
#' # (labels = outcome column which will be learned).
#' # Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#'
#' # train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.importance(train$data@@Dimnames[[2]], model = bst)
#'
#' # Same thing with co-occurence computation this time
#' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label)
#'
#' @export
xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ((x + label) == 2)){
  # Compute feature importance from a model or from a text dump.
  #   feature_names - character vector of feature names, or NULL
  #   filename_dump - path of a text dump; used when `model` is NULL
  #   model         - xgb.Booster; when given it is dumped in memory
  #   data, label   - optional, must be given together; enable the
  #                   co-occurence columns for tree models
  #   target        - function applied to each column in the co-occurence step
  # Returns the table built by linearDump() or treeDump().
  if (!class(feature_names) %in% c("character", "NULL")) {
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
  }
  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
    stop("filename_dump: Has to be a path to the model dump file.")
  }
  if (!class(model) %in% c("xgb.Booster", "NULL")) {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }
  if (is.null(data) != is.null(label)) {
    stop("data/label: Provide the two arguments if you want co-occurence computation or none of them if you are not interested but not one of them only.")
  }
  if (is.null(model) && is.null(filename_dump)) {
    # Fail with a clear message instead of an obscure readLines() error.
    stop("filename_dump/model: Provide either a model or the path to a model dump file.")
  }
  if(class(label) == "numeric"){
    # Mostly-zero labels are stored as a sparse vector.
    if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector")
  }
  if(is.null(model)){
    text <- readLines(filename_dump)
  } else {
    text <- xgb.dump(model = model, with.stats = T)
  }
  if(text[2] == "bias:"){
    # Linear model. Use the dump text already obtained above: re-reading
    # filename_dump here failed whenever only `model` was supplied.
    result <- linearDump(feature_names, text)
    if(!is.null(data) || !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
  } else {
    result <- treeDump(feature_names, text = text, keepDetail = !is.null(data))
    # Co-occurence computation
    if(!is.null(data) && !is.null(label) && nrow(result) > 0) {
      # Take care of missing column
      a <- data[, result[MissingNo == T,Feature], drop=FALSE] != 0
      # Bind the two Matrix and reorder columns
      c <- data[, result[MissingNo == F,Feature], drop=FALSE] %>% cBind(a,.) %>% .[,result[,Feature]]
      rm(a)
      # Apply split
      d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split])
      apply(c & d, 2, . %>% target %>% sum) -> vec
      result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo:=NULL]
    }
  }
  result
}
treeDump <- function(feature_names, text, keepDetail){
# Summarise a tree-model dump per feature.
# Grouping is by Feature alone, or by Feature, Split and MissingNo when keepDetail is TRUE.
if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature"
# Steps of the chain below: flag rows where Missing equals No; drop rows whose
# Feature is "Leaf"; per group sum Quality (as Gain) and Cover and count rows
# (as Frequence); divide each of the three columns by its total; sort by
# decreasing Gain.
result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo":= Missing == No ][Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))][order(Gain, decreasing = T)]
result
}
linearDump <- function(feature_names, text){
  # The weights are every line that follows the "weight:" marker.
  first_weight <- which(text == "weight:") + 1
  weights <- as.numeric(text[first_weight:length(text)])
  data.table(Feature = feature_names, Weight = weights)
}
# Declare the names below as known globals so that the package check does not
# report them as undefined variables: they are data.table column names (and
# the "." placeholder) that are never assigned in this file.
globalVariables(c(".", "Feature", "Split", "No", "Missing", "MissingNo", "RealCover"))

32
R-package/R/xgb.load.R Normal file
View File

@@ -0,0 +1,32 @@
#' Load xgboost model from binary file
#'
#' Load xgboost model from the binary model file
#'
#' @param modelfile the name of the binary file.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' xgb.save(bst, 'xgb.model')
#' bst <- xgb.load('xgb.model')
#' pred <- predict(bst, test$data)
#' @export
#'
xgb.load <- function(modelfile) {
  # Guard clause: a model source is mandatory.
  if (is.null(modelfile)) {
    stop("xgb.load: modelfile cannot be NULL")
  }

  handle <- xgb.Booster(modelfile = modelfile)

  # When the model was given as a raw vector, hand that same vector over to
  # the booster so that it does not need to be serialized again.
  serialized <- if (is.raw(modelfile)) modelfile else NULL

  booster <- xgb.handleToBooster(handle, serialized)
  booster <- xgb.Booster.check(booster)
  return(booster)
}

View File

@@ -0,0 +1,170 @@
#' Convert tree model dump to data.table
#'
#' Read a tree model text dump and return a data.table.
#'
#' @importFrom data.table data.table
#' @importFrom data.table set
#' @importFrom data.table rbindlist
#' @importFrom data.table copy
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @importFrom magrittr not
#' @importFrom magrittr add
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_extract
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param n_first_tree limit the conversion to the first n trees. If \code{NULL}, all trees of the model are converted. Performance can be low for huge models.
#'
#' @return A \code{data.table} of the nodes of the model with their feature, gain, cover and a few other details.
#'
#' @details
#' General function to convert a text dump of a tree model to a \code{data.table}. The purpose is to help the user explore the model and get a better understanding of it.
#'
#' The content of the \code{data.table} is organised that way:
#'
#' \itemize{
#' \item \code{ID}: unique identifier of a node ;
#' \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
#' \item \code{Split}: value of the chosen feature where is operated the split ;
#' \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
#' \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
#' \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
#' \item \code{Quality}: it's the gain related to the split in this specific node ;
#' \item \code{Cover}: metric to measure the number of observation affected by the split ;
#' \item \code{Tree}: ID of the tree. It is included in the main ID ;
#' \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ;
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#'
#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
  # Parse the text dump of a boosted tree model into a data.table holding one
  # row per node (columns ID, Feature, Split, Yes, No, Missing, Quality, Cover,
  # Tree) plus the Feature / Cover / Quality of the "Yes" and "No" children of
  # every branch node (columns Yes.* and No.*).
  #
  # The dump is taken, in order of priority, from `model` (through xgb.dump),
  # from the file `filename_dump`, or from `text`.
  #
  # NOTE(review): str_match and str_replace are used below while the roxygen
  # header of this function only imports str_extract, str_split and str_trim
  # from stringr; confirm that they are imported elsewhere in the package.

  # ---- Argument validation --------------------------------------------------
  # is.null()/is.character()/is.numeric()/inherits() always yield one logical,
  # unlike `class(x) %in% ...`, which is a vector for multi-class objects.
  if (!(is.null(feature_names) || is.character(feature_names))) {
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
  }
  if (!((is.null(filename_dump) || is.character(filename_dump)) && length(filename_dump) <= 1)) {
    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
    stop("filename_dump: path to the model doesn't exist.")
  } else if (is.null(filename_dump) && is.null(model) && is.null(text)) {
    stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
  }
  if (!(is.null(model) || inherits(model, "xgb.Booster"))) {
    stop("model: Has to be an object of class xgb.Booster model generated by the xgb.train function.")
  }
  if (!(is.null(text) || is.character(text))) {
    stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
  }
  # is.numeric() accepts integer values (e.g. 5L) as well as doubles.
  if (!(is.null(n_first_tree) || is.numeric(n_first_tree)) || length(n_first_tree) > 1) {
    stop("n_first_tree: Has to be a numeric vector of size 1.")
  }

  # ---- Get the dump as a character vector -----------------------------------
  if (!is.null(model)) {
    text <- xgb.dump(model = model, with.stats = TRUE)
  } else if (!is.null(filename_dump)) {
    text <- readLines(filename_dump) %>% str_trim(side = "both")
  }

  # Line index of every "booster" header, followed by a sentinel one past the
  # end, so that tree i spans the lines strictly between position[i] and
  # position[i + 1].
  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)

  # Numeric value found after "=" in the first match of `pattern`.
  extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist

  # Node identifiers are prefixed by the tree index: "<tree>-<node>".
  addTreeId <- function(x, i) paste(i, x, sep = "-")

  # min() ignores a NULL n_first_tree; max() guards against negative values.
  n_round <- max(0, min(length(position) - 1, n_first_tree))

  anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"

  # ---- Parse the trees one by one -------------------------------------------
  # The per-tree tables are collected in a list and bound once at the end.
  # seq_len() (unlike 1:n_round) iterates zero times when there is no tree.
  trees <- list()
  for (i in seq_len(n_round)) {
    tree <- text[(position[i] + 1):(position[i + 1] - 1)]

    # avoid tree made of a leaf only (no split)
    if (length(tree) < 2) next

    treeID <- i - 1

    notLeaf <- str_match(tree, "leaf") %>% is.na
    leaf <- notLeaf %>% not %>% tree[.]
    branch <- notLeaf %>% tree[.]

    idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
    idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)

    # Features appear as "f<index>" with a 0-based index in the dump.
    featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric
    if (!is.null(feature_names)) {
      featureBranch <- feature_names[featureBranch + 1]
    }
    featureLeaf <- rep("Leaf", length(leaf))

    splitBranch <- str_extract(branch, paste0("<", anynumber_regex, "\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "")
    splitLeaf <- rep(NA, length(leaf))

    yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
    yesLeaf <- rep(NA, length(leaf))

    noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID)
    noLeaf <- rep(NA, length(leaf))

    missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)
    missingLeaf <- rep(NA, length(leaf))

    # For a leaf, the "Quality" column stores the leaf value.
    qualityBranch <- extract(branch, paste0("gain=", anynumber_regex))
    qualityLeaf <- extract(leaf, paste0("leaf=", anynumber_regex))

    coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
    coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")

    dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][, Tree := treeID]

    trees[[length(trees) + 1]] <- dt
  }
  allTrees <- rbindlist(trees, use.names = TRUE, fill = FALSE)

  if (nrow(allTrees) == 0) {
    stop("xgb.model.dt.tree: no tree with at least one split has been found in the model dump.")
  }

  # ---- Attach the description of the children to each branch node -----------
  # The children are looked up by ID with match(): this yields exactly one
  # child row per branch node, in the order of the branch rows. A plain
  # `ID == yes` comparison would instead compare the two vectors element by
  # element (with recycling), which does not pair a node with its child.
  branchRows <- which(allTrees[["Feature"]] != "Leaf")
  for (side in c("Yes", "No")) {
    childRows <- match(allTrees[[side]][branchRows], allTrees[["ID"]])
    for (stat in c("Feature", "Cover", "Quality")) {
      set(allTrees, i = branchRows,
          j = paste(side, stat, sep = "."),
          value = allTrees[[stat]][childRows])
    }
  }

  allTrees
}
# Declare the names below as global variables so that the CRAN check
# (R CMD check) does not complain about them.
# None of them is ever declared as an R object: they are mainly data.table
# column names and symbols (".", ".N") used inside `[...]`.
globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence"))

View File

@@ -0,0 +1,57 @@
#' Plot feature importance bar graph
#'
#' Read a data.table containing feature importance details and plot it.
#'
#' @importFrom magrittr %>%
#' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function.
#' @param numberOfClusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.
#'
#' @return A \code{ggplot2} bar graph representing each feature by a horizontal bar. The longer the bar, the more important the feature. Features are ordered by importance and clustered by importance. The cluster is represented through the color of the bar.
#'
#' @details
#' The purpose of this function is to easily represent the importance of each feature of a model.
#' The function returns a ggplot graph, therefore each of its characteristics can be overridden (to customize it).
#' In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#'
#' #train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' importance_matrix <- xgb.importance(train$data@@Dimnames[[2]], model = bst)
#' xgb.plot.importance(importance_matrix)
#'
#' @export
xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){
  # Draw the Gain of each feature as a horizontal bar; bars are coloured by
  # the cluster that a 1-D k-means (Ckmeans.1d.dp) assigns to their Gain.
  if (!inherits(importance_matrix, "data.table")) {
    stop("importance_matrix: Should be a data.table.")
  }
  # ggplot2 is attached (not merely loaded) because its functions are called
  # unqualified below.
  if (!require(ggplot2, quietly = TRUE)) {
    stop("ggplot2 package is required for plotting the importance", call. = FALSE)
  }
  if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
    stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
  }

  # To avoid issues in clustering when co-occurences are used:
  # keep a single row (summed Gain) per feature.
  importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]

  gains <- importance_matrix[, Gain]
  clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(gains, numberOfClusters))
  importance_matrix[, "Cluster" := as.character(clusters$cluster)]

  # Bars, one per feature, ordered by Gain.
  importance_plot <- ggplot(importance_matrix,
                            aes(x = reorder(Feature, Gain), y = Gain, width = 0.05),
                            environment = environment()) +
    geom_bar(aes(fill = Cluster), stat = "identity", position = "identity")

  # Orientation, labels and theme.
  importance_plot <- importance_plot +
    coord_flip() +
    xlab("Features") +
    ylab("Gain") +
    ggtitle("Feature importance") +
    theme(plot.title = element_text(lineheight = .9, face = "bold"),
          panel.grid.major.y = element_blank())

  return(importance_plot)
}
# Declare the names below as global variables so that the CRAN check
# (R CMD check) does not complain about them.
# None of them is ever declared as an R object in this file: they are
# data.table column names, and ggplot2 functions that only become available
# once ggplot2 has been attached by require() at run time.
globalVariables(c("Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"))

View File

@@ -0,0 +1,97 @@
#' Plot a boosted tree model
#'
#' Read a tree model text dump.
#' Plotting only works for boosted tree model (not linear model).
#'
#' @importFrom data.table data.table
#' @importFrom data.table set
#' @importFrom data.table rbindlist
#' @importFrom data.table :=
#' @importFrom data.table copy
#' @importFrom magrittr %>%
#' @importFrom magrittr not
#' @importFrom magrittr add
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_extract
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#' @param CSSstyle a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
#' @param width the width of the diagram in pixels.
#' @param height the height of the diagram in pixels.
#'
#' @return A \code{DiagrammeR} of the model.
#'
#' @details
#'
#' The content of each node is organised that way:
#'
#' \itemize{
#' \item \code{feature} value ;
#' \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ;
#' \item \code{gain}: metric measuring the importance of the node in the model.
#' }
#'
#' Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
#' It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#'
#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
#'
xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){
  # Render the trees of a boosted tree model as a Mermaid flowchart
  # (DiagrammeR widget). The node table comes from xgb.model.dt.tree; every
  # branch node yields two edges, one to its "Yes" child and one to its "No"
  # child.

  # ---- Argument validation --------------------------------------------------
  if (!((is.null(CSSstyle) || is.character(CSSstyle)) && length(CSSstyle) <= 1)) {
    stop("CSSstyle: Has to be a character vector of size 1.")
  }
  if (!(is.null(model) || inherits(model, "xgb.Booster"))) {
    stop("model: Has to be an object of class xgb.Booster model generated by the xgb.train function.")
  }
  if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
    stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE)
  }

  # ---- Node table -----------------------------------------------------------
  if (is.null(model)) {
    allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
  } else {
    allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
  }

  # ---- Mermaid edges --------------------------------------------------------
  # "Yes" edge: parent labelled with feature, cover and gain; edge labelled
  # with "< split"; child labelled with its feature.
  allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
  # "No" edge: edge labelled with ">= split".
  allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]

  if (is.null(CSSstyle)) {
    CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
  }

  # Children reached through "Yes" get the green style, through "No" the red one.
  yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
  no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")

  # Full diagram: header, sorted edges, style definitions, class assignments.
  path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";")

  # width and height are passed by name: given positionally they would be
  # captured by the `...` argument of mermaid() and have no effect on the size.
  DiagrammeR::mermaid(path, width = width, height = height)
}
# Declare the names below as global variables so that the CRAN check
# (R CMD check) does not complain about them.
# None of them is ever declared as an R object: they are mainly data.table
# column names used inside `[...]`, plus the "." placeholder of the pipe.
globalVariables(c("Feature", "yesPath", "ID", "Cover", "Quality", "Split", "Yes", "Yes.Feature", "noPath", "No", "No.Feature", "."))

32
R-package/R/xgb.save.R Normal file
View File

@@ -0,0 +1,32 @@
#' Save xgboost model to binary file
#'
#' Save xgboost model from xgboost or xgb.train
#'
#' @param model the model object.
#' @param fname the name of the binary file.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' xgb.save(bst, 'xgb.model')
#' bst <- xgb.load('xgb.model')
#' pred <- predict(bst, test$data)
#' @export
#'
xgb.save <- function(model, fname) {
  # Write an xgb.Booster model to the binary file `fname`.
  # Returns TRUE on success; stops with an error when `fname` is not a
  # character value or `model` is not an xgb.Booster.
  if (typeof(fname) != "character") {
    stop("xgb.save: fname must be character")
  }
  # inherits() yields a single logical even for objects with several classes.
  if (!inherits(model, "xgb.Booster")) {
    stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save
xgb.DMatrix object.")
  }
  model <- xgb.Booster.check(model)
  .Call("XGBoosterSaveModel_R", model$handle, fname, PACKAGE = "xgboost")
  return(TRUE)
}

View File

@@ -0,0 +1,30 @@
#' Save xgboost model to R's raw vector,
#' user can call xgb.load to load the model back from raw vector
#'
#' Save xgboost model from xgboost or xgb.train
#'
#' @param model the model object.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' raw <- xgb.save.raw(bst)
#' bst <- xgb.load(raw)
#' pred <- predict(bst, test$data)
#' @export
#'
xgb.save.raw <- function(model) {
  # Serialize a model into a raw vector.
  # Accepts either an xgb.Booster (its handle is used) or an
  # xgb.Booster.handle; anything else is rejected.
  handle <- if (class(model) == "xgb.Booster") model$handle else model

  if (class(handle) != "xgb.Booster.handle") {
    stop("xgb.raw: the input must be xgb.Booster.handle. Use xgb.DMatrix.save to save
xgb.DMatrix object.")
  }

  serialized <- .Call("XGBoosterModelToRaw_R", handle, PACKAGE = "xgboost")
  return(serialized)
}

208
R-package/R/xgb.train.R Normal file
View File

@@ -0,0 +1,208 @@
#' eXtreme Gradient Boosting Training
#'
#' An advanced interface for training xgboost model. Look at \code{\link{xgboost}} function for a simpler interface.
#'
#' @param params the list of parameters.
#'
#' 1. General Parameters
#'
#' \itemize{
#' \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
#' \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
#' }
#'
#' 2. Booster Parameters
#'
#' 2.1. Parameter for Tree Booster
#'
#' \itemize{
#' \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
#' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
#' \item \code{max_depth} maximum depth of a tree. Default: 6
#' \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
#' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
#' \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
#' }
#'
#' 2.2. Parameter for Linear Booster
#'
#' \itemize{
#' \item \code{lambda} L2 regularization term on weights. Default: 0
#' \item \code{lambda_bias} L2 regularization term on bias. Default: 0
#' \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
#' }
#'
#' 3. Task Parameters
#'
#' \itemize{
#' \item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
#' \itemize{
#' \item \code{reg:linear} linear regression (Default).
#' \item \code{reg:logistic} logistic regression.
#' \item \code{binary:logistic} logistic regression for binary classification. Output probability.
#' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
#' \item \code{num_class} set the number of classes. To use only with multiclass objectives.
#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
#' \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
#' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
#' }
#' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
#' \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
#' }
#'
#' @param data takes an \code{xgb.DMatrix} as the input.
#' @param nrounds the max number of iterations
#' @param watchlist what information should be printed when \code{verbose=1} or
#' \code{verbose=2}. Watchlist is used to specify validation set monitoring
#' during training. For example user can specify
#' watchlist=list(validation1=mat1, validation2=mat2) to watch
#' the performance of each round's model on mat1 and mat2
#'
#' @param obj customized objective function. Returns gradient and second order
#' gradient with given prediction and dtrain,
#' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain,
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress information
#' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' keeps getting worse consecutively for \code{k} rounds.
#' @param early.stop.round An alternative of \code{early_stop_round}.
#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
#' This is the training function for \code{xgboost}.
#'
#' It supports advanced features such as \code{watchlist}, customized objective function (\code{obj}) and customized evaluation function (\code{feval}),
#' therefore it is more flexible than \code{\link{xgboost}} function.
#'
#' Parallelization is automatically enabled if \code{OpenMP} is present.
#' Number of threads can also be manually specified via \code{nthread} parameter.
#'
#' \code{eval_metric} parameter (not listed above) is set automatically by Xgboost but can be overridden by parameter. Below is the list of the different metrics optimized by Xgboost, to help you understand how it works inside or to use them with the \code{watchlist} parameter.
#' \itemize{
#' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
#' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
#' \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
#' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
#' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve} for ranking evaluation.
#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
#' }
#'
#' Full list of parameters is available in the Wiki \url{https://github.com/dmlc/xgboost/wiki/Parameters}.
#'
#' This function only accepts an \code{\link{xgb.DMatrix}} object as the input.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' dtest <- dtrain
#' watchlist <- list(eval = dtest, train = dtrain)
#' param <- list(max.depth = 2, eta = 1, silent = 1)
#' logregobj <- function(preds, dtrain) {
#' labels <- getinfo(dtrain, "label")
#' preds <- 1/(1 + exp(-preds))
#' grad <- preds - labels
#' hess <- preds * (1 - preds)
#' return(list(grad = grad, hess = hess))
#' }
#' evalerror <- function(preds, dtrain) {
#' labels <- getinfo(dtrain, "label")
#' err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
#' return(list(metric = "error", value = err))
#' }
#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror)
#' @export
#'
xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
                      obj = NULL, feval = NULL, verbose = 1, printEveryN=1L,
                      early_stop_round = NULL, early.stop.round = NULL,
                      maximize = NULL, ...) {
  # Train a booster on `data` for at most `nrounds` iterations, optionally
  # evaluating the sets of `watchlist` after each round and stopping early
  # when the score of the first watched set stops improving.
  # Returns the trained booster; with early stopping enabled it additionally
  # carries `bestScore` and `bestInd`.
  dtrain <- data

  # ---- Argument validation --------------------------------------------------
  if (typeof(params) != "list") {
    stop("xgb.train: first argument params must be list")
  }
  # inherits() yields a single logical even for objects with several classes.
  if (!inherits(dtrain, "xgb.DMatrix")) {
    stop("xgb.train: second argument dtrain must be xgb.DMatrix")
  }

  # ---- Parameters -----------------------------------------------------------
  # Only verbose > 1 lets the library itself print running messages.
  if (verbose > 1) {
    params <- append(params, list(silent = 0))
  } else {
    params <- append(params, list(silent = 1))
  }
  if (length(watchlist) != 0 && verbose == 0) {
    warning('watchlist is provided but verbose=0, no evaluation information will be printed')
    watchlist <- list()
  }
  # Extra named arguments are treated as additional parameters.
  params <- append(params, list(...))

  # ---- Early stopping set-up ------------------------------------------------
  # early.stop.round is an alias, used only when early_stop_round is not set.
  if (is.null(early_stop_round) && !is.null(early.stop.round))
    early_stop_round <- early.stop.round
  if (!is.null(early_stop_round)) {
    if (!is.null(feval) && is.null(maximize))
      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
    if (length(watchlist) == 0)
      stop('For early stopping you need at least one set in watchlist.')
    if (is.null(maximize) && is.null(params$eval_metric))
      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
    if (is.null(maximize)) {
      # Error-like metrics are minimized; any other metric is maximized.
      if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
        maximize <- FALSE
      } else {
        maximize <- TRUE
      }
    }
    # Start from the worst possible score, so that the first round always
    # becomes the current best, including when the metric is <= 0.
    if (maximize) {
      bestScore <- -Inf
    } else {
      bestScore <- Inf
    }
    bestInd <- 0
    if (length(watchlist) > 1)
      warning('Only the first data set in watchlist is used for early stopping process.')
  }

  # ---- Boosting loop --------------------------------------------------------
  handle <- xgb.Booster(params, append(watchlist, dtrain))
  bst <- xgb.handleToBooster(handle)
  printEveryN <- max(as.integer(printEveryN), 1L)
  # seq_len() (unlike 1:nrounds) performs no iteration when nrounds is 0.
  for (i in seq_len(nrounds)) {
    xgb.iter.update(bst$handle, dtrain, i - 1, obj)
    if (length(watchlist) != 0) {
      msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval)
      if (0 == ((i - 1) %% printEveryN))
        cat(paste(msg, "\n", sep = ""))
      if (!is.null(early_stop_round)) {
        # The third token of the evaluation message, split on ":" and
        # whitespace, is the score of the first watched set.
        score <- strsplit(msg, ':|\\s+')[[1]][3]
        score <- as.numeric(score)
        if ((maximize && score > bestScore) || (!maximize && score < bestScore)) {
          bestScore <- score
          bestInd <- i
        } else {
          if (i - bestInd >= early_stop_round) {
            cat('Stopping. Best iteration:', bestInd)
            break
          }
        }
      }
    }
  }

  bst <- xgb.Booster.check(bst)
  if (!is.null(early_stop_round)) {
    bst$bestScore <- bestScore
    bst$bestInd <- bestInd
  }
  return(bst)
}

137
R-package/R/xgboost.R Normal file
View File

@@ -0,0 +1,137 @@
#' eXtreme Gradient Boosting (Tree) library
#'
#' A simple interface for training xgboost model. Look at \code{\link{xgb.train}} function for a more advanced interface.
#'
#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param label the response variable. User should not set this field,
#' if data is local data file or \code{xgb.DMatrix}.
#' @param params the list of parameters.
#'
#' Commonly used ones are:
#' \itemize{
#' \item \code{objective} objective function, common ones are
#' \itemize{
#' \item \code{reg:linear} linear regression
#' \item \code{binary:logistic} logistic regression for classification
#' }
#' \item \code{eta} step size of each boosting step
#' \item \code{max.depth} maximum depth of the tree
#' \item \code{nthread} number of thread used in training, if not set, all threads are used
#' }
#'
#' Look at \code{\link{xgb.train}} for a more complete list of parameters or \url{https://github.com/dmlc/xgboost/wiki/Parameters} for the full list.
#'
#' See also \code{demo/} for walkthrough example in R.
#'
#' @param nrounds the max number of iterations
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress information
#' @param printEveryN Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing values. Some datasets use 0 or another extreme value to represent missing values.
#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' keeps getting worse consecutively for \code{k} rounds.
#' @param early.stop.round An alternative of \code{early_stop_round}.
#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
#' This is the modeling function for Xgboost.
#'
#' Parallelization is automatically enabled if \code{OpenMP} is present.
#'
#' Number of threads can also be manually specified via \code{nthread} parameter.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#' pred <- predict(bst, test$data)
#'
#' @export
#'
xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds,
                    verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL,
                    maximize = NULL, ...) {
  # Simple training interface: builds the training data object, assembles the
  # parameter list and delegates the actual boosting to xgb.train().
  #
  # Returns whatever xgb.train() returns (the trained booster).

  # Build the training data; `missing` is only forwarded when the caller set it.
  if (is.null(missing)) {
    dtrain <- xgb.get.DMatrix(data, label)
  } else {
    dtrain <- xgb.get.DMatrix(data, label, missing)
  }

  # Any extra named arguments are treated as booster parameters.
  params <- append(params, list(...))

  # When verbose, watch the training set so that evaluation messages are printed.
  if (verbose > 0) {
    watchlist <- list(train = dtrain)
  } else {
    watchlist <- list()
  }

  # `maximize` must be forwarded as well: previously it was accepted here but
  # silently dropped, so xgb.train() never saw the direction the user chose
  # for early stopping.
  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, printEveryN=printEveryN,
                   early_stop_round = early_stop_round,
                   early.stop.round = early.stop.round,
                   maximize = maximize)
  return(bst)
}
#' Training part from Mushroom Data Set
#'
#' This data set is originally from the Mushroom data set,
#' UCI Machine Learning Repository.
#'
#' This data set includes the following fields:
#'
#' \itemize{
#' \item \code{label} the label for each record
#' \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
#' }
#'
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
#' School of Information and Computer Science.
#'
#' @docType data
#' @keywords datasets
#' @name agaricus.train
#' @usage data(agaricus.train)
#' @format A list containing a label vector, and a dgCMatrix object with 6513
#' rows and 126 variables
NULL
#' Test part from Mushroom Data Set
#'
#' This data set is originally from the Mushroom data set,
#' UCI Machine Learning Repository.
#'
#' This data set includes the following fields:
#'
#' \itemize{
#' \item \code{label} the label for each record
#' \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
#' }
#'
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
#' School of Information and Computer Science.
#'
#' @docType data
#' @keywords datasets
#' @name agaricus.test
#' @usage data(agaricus.test)
#' @format A list containing a label vector, and a dgCMatrix object with 1611
#' rows and 126 variables
NULL

20
R-package/README.md Normal file
View File

@@ -0,0 +1,20 @@
# R package for xgboost.
## Installation
For the up-to-date version (which is recommended), please install from GitHub. Windows users will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
```r
devtools::install_github('dmlc/xgboost',subdir='R-package')
```
For stable version on CRAN, please run
```r
install.packages('xgboost')
```
## Examples
* Please visit [walk through example](demo).
* See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.R) on this dataset and the one related to [Otto challenge](../demo/kaggle-otto), including a [RMarkdown documentation](../demo/kaggle-otto/understandingXGBoostModel.Rmd).

Binary file not shown.

Binary file not shown.

10
R-package/demo/00Index Normal file
View File

@@ -0,0 +1,10 @@
basic_walkthrough Basic feature walkthrough
custom_objective Customize loss function, and evaluation metric
boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees
generalized_linear_model Generalized Linear Model
cross_validation Cross validation
create_sparse_matrix Create Sparse Matrix
predict_leaf_indices Predicting the corresponding leaves
early_stopping Early Stop in training
poisson_regression Poisson Regression on count data

18
R-package/demo/README.md Normal file
View File

@@ -0,0 +1,18 @@
XGBoost R Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Customize loss function, and evaluation metric](custom_objective.R)
* [Boosting from existing prediction](boost_from_prediction.R)
* [Predicting using first n trees](predict_first_ntree.R)
* [Generalized Linear Model](generalized_linear_model.R)
* [Cross validation](cross_validation.R)
* [Create a sparse matrix from a dense one](create_sparse_matrix.R)
Benchmarks
====
* [Starter script for Kaggle Higgs Boson](../../demo/kaggle-higgs)
Notes
====
* Contributions of examples and benchmarks are more than welcome!
* If you would like to share how you use xgboost to solve your problem, send a pull request :)

View File

@@ -0,0 +1,105 @@
require(xgboost)
require(methods)
# we load in the agaricus dataset
# In this example, we are aiming to predict whether a mushroom can be eaten
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
# the loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1}
class(train$label)
class(train$data)
#-------------Basic Training using XGBoost-----------------
# this is the basic usage of xgboost you can put matrix in data field
# note: we are putting in sparse matrix here, xgboost naturally handles sparse input
# use sparse matrix when your feature is sparse(e.g. when you using one-hot encoding vector)
print("training xgboost with sparseMatrix")
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
nthread = 2, objective = "binary:logistic")
# alternatively, you can put in dense matrix, i.e. basic R-matrix
print("training xgboost with Matrix")
bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
nthread = 2, objective = "binary:logistic")
# you can also put in xgb.DMatrix object, stores label, data and other meta datas needed for advanced features
print("training xgboost with xgb.DMatrix")
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2,
objective = "binary:logistic")
# Verbose = 0,1,2
print ('train xgboost with verbose 0, no message')
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
nthread = 2, objective = "binary:logistic", verbose = 0)
print ('train xgboost with verbose 1, print evaluation metric')
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
nthread = 2, objective = "binary:logistic", verbose = 1)
print ('train xgboost with verbose 2, also print information about tree')
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
nthread = 2, objective = "binary:logistic", verbose = 2)
# you can also specify data as file path to a LibSVM format input
# since we do not have this file with us, the following line is just for illustration
# bst <- xgboost(data = 'agaricus.train.svm', max.depth = 2, eta = 1, nround = 2,objective = "binary:logistic")
#--------------------basic prediction using xgboost--------------
# you can do prediction using the following line
# you can put in Matrix, sparseMatrix, or xgb.DMatrix
pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))
#-------------------save and load models-------------------------
# save model to binary local file
xgb.save(bst, "xgboost.model")
# load binary model to R
bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
# save model to R's raw vector
raw = xgb.save.raw(bst)
# load the model back from the raw vector
bst3 <- xgb.load(raw)
pred3 <- predict(bst3, test$data)
# pred3 should be identical to pred
# (compare pred3 here: the printed label already said pred3, but the value
#  was computed from pred2, so the raw-vector round trip was never checked)
print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred))))
#----------------Advanced features --------------
# to use advanced features, we need to put data in xgb.DMatrix
dtrain <- xgb.DMatrix(data = train$data, label=train$label)
dtest <- xgb.DMatrix(data = test$data, label=test$label)
#---------------Using watchlist----------------
# watchlist is a list of xgb.DMatrix, each of them tagged with name
watchlist <- list(train=dtrain, test=dtest)
# to train with watchlist, use xgb.train, which contains more advanced features
# watchlist allows us to monitor the evaluation result on all data in the list
print ('train xgboost using xgb.train with watchlist')
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
nthread = 2, objective = "binary:logistic")
# we can change evaluation metrics, or use multiple evaluation metrics
print ('train xgboost using xgb.train with watchlist, watch logloss and error')
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
eval.metric = "error", eval.metric = "logloss",
nthread = 2, objective = "binary:logistic")
# xgb.DMatrix can also be saved using xgb.DMatrix.save
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# to load it in, simply call xgb.DMatrix
dtrain2 <- xgb.DMatrix("dtrain.buffer")
bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nround=2, watchlist=watchlist,
nthread = 2, objective = "binary:logistic")
# information can be extracted from xgb.DMatrix using getinfo
label = getinfo(dtest, "label")
pred <- predict(bst, dtest)
err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
print(paste("test-error=", err))
# You can dump the tree you learned using xgb.dump into a text file
xgb.dump(bst, "dump.raw.txt", with.stats = T)
# Finally, you can check which features are the most important.
print("Most important features (look at column Gain):")
print(xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt"))

View File

@@ -0,0 +1,26 @@
# Demo: continue boosting from an existing prediction by storing that
# prediction as the "base_margin" field of the xgb.DMatrix objects.
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
watchlist <- list(eval = dtest, train = dtrain)
###
# advanced: start from an initial base prediction
#
print('start running example to start from a initial prediction')
# train xgboost for 1 round
param <- list(max.depth=2,eta=1,nthread = 2, silent=1,objective='binary:logistic')
bst <- xgb.train( param, dtrain, 1, watchlist )
# Note: the base margin must be the raw margin value, not the transformed prediction
# predicting with outputmargin=TRUE gives the margin values before the logistic transformation
ptrain <- predict(bst, dtrain, outputmargin=TRUE)
ptest <- predict(bst, dtest, outputmargin=TRUE)
# set the base_margin property of dtrain and dtest
# base margin is the base prediction we will boost from
setinfo(dtrain, "base_margin", ptrain)
setinfo(dtest, "base_margin", ptest)
print('this is result of boost from initial prediction')
# train one more round; it now starts from the base margin stored above
bst <- xgb.train( param, dtrain, 1, watchlist )

View File

@@ -0,0 +1,89 @@
require(xgboost)
require(Matrix)
require(data.table)
# vcd is available on CRAN; it is used here only for its dataset with categorical values.
if (!require(vcd)) {
  install.packages('vcd')
  # install.packages() only installs the package, it does not attach it:
  # load it explicitly so that data(Arthritis) below works on the first run too.
  require(vcd)
}
# According to its documentation, Xgboost works only on numbers.
# Sometimes the dataset we have to work on have categorical data.
# A categorical variable is one which has a fixed number of values. For example, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable.
#
# In R, categorical variable is called Factor.
# Type ?factor in console for more information.
#
# In this demo we will see how to transform a dense dataframe with categorical variables to a sparse matrix before analyzing it in Xgboost.
# The method we are going to see is usually called "one hot encoding".
#load Arthritis dataset in memory.
data(Arthritis)
# create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good).
df <- data.table(Arthritis, keep.rownames = F)
# Let's have a look to the data.table
cat("Print the dataset\n")
print(df)
# 2 columns have factor type, one has ordinal type (an ordinal variable is a categorical variable with values which can be ordered, here: None > Some > Marked).
cat("Structure of the dataset\n")
str(df)
# Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treats them as independent values.
df[,AgeDiscret:= as.factor(round(Age/10,0))]
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[,ID:=NULL]
# List the different values for the column Treatment: Placebo, Treated.
cat("Values of the categorical feature Treatment\n")
print(levels(df[,Treatment]))
# Next step, we will transform the categorical data to dummy variables.
# This method is also called one hot encoding.
# The purpose is to transform each value of each categorical feature in one binary feature.
#
# Let's take, the column Treatment will be replaced by two columns, Placebo, and Treated. Each of them will be binary. For example an observation which had the value Placebo in column Treatment before the transformation will have, after the transformation, the value 1 in the new column Placebo and the value 0 in the new column Treated.
#
# Formulae Improved~.-1 used below means transform all categorical features but column Improved to binary values.
# Column Improved is excluded because it will be our output column, the one we want to predict.
sparse_matrix = sparse.model.matrix(Improved~.-1, data = df)
cat("Encoding of the sparse Matrix\n")
print(sparse_matrix)
# Create the output vector (not sparse)
# 1. Set, for all rows, field in Y column to 0;
# 2. set Y to 1 when Improved == Marked;
# 3. Return Y column
output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
# Following is the same process as other demo
cat("Learning...\n")
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
# sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
print(importance)
# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
# Does these results make sense?
# Let's check some Chi2 between each of these features and the outcome.
print(chisq.test(df$Age, df$Y))
# Pearson correlation between Age and illness disapearing is 35
print(chisq.test(df$AgeDiscret, df$Y))
# Our first simplification of Age gives a Pearson correlation of 8.
print(chisq.test(df$AgeCat, df$Y))
# The perfectly random split I did between young and old at 30 years old have a low correlation of 2. It's a result we may expect as may be in my mind > 30 years is being old (I am 32 and starting feeling old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same. Don't let your "gut" lower the quality of your model. In "data science", there is science :-)
# As you can see, in general destroying information by simplying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not enough complex to show that. Check Kaggle forum for some challenging datasets.
# However it's almost always worse when you add some arbitrary rules.
# Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. Linear model may not be that strong in these scenario.

View File

@@ -0,0 +1,51 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
nround <- 2
param <- list(max.depth=2,eta=1,silent=1,nthread = 2, objective='binary:logistic')
cat('running cross validation\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, nround, nfold=5, metrics={'error'})
cat('running cross validation, disable standard deviation display\n')
# do cross validation with showsd = FALSE: the standard deviation of the
# metric is not displayed in the printed result
xgb.cv(param, dtrain, nround, nfold=5,
metrics={'error'}, showsd = FALSE)
###
# you can also do cross validation with customized loss function
# See custom_objective.R
##
print ('running cross validation, with cutomsized loss function')
logregobj <- function(preds, dtrain) {
  # Customized logistic-loss objective.
  # `preds` are raw margins; they are mapped through the sigmoid, then the
  # first-order (grad) and second-order (hess) terms are returned as a list.
  y <- getinfo(dtrain, "label")
  prob <- 1/(1 + exp(-preds))
  list(grad = prob - y, hess = prob * (1 - prob))
}
evalerror <- function(preds, dtrain) {
  # Customized evaluation: fraction of rows whose label differs from the
  # thresholded margin (a margin above 0 is treated as TRUE).
  truth <- getinfo(dtrain, "label")
  n_wrong <- sum(truth != (preds > 0))
  list(metric = "error", value = as.numeric(n_wrong)/length(truth))
}
param <- list(max.depth=2,eta=1,silent=1)
# train with customized objective
xgb.cv(param, dtrain, nround, nfold = 5,
obj = logregobj, feval=evalerror)
# do cross validation with prediction values for each fold
res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)
res$dt
length(res$pred)

View File

@@ -0,0 +1,62 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
  # Log-likelihood loss on raw margins: returns list(grad, hess), the first
  # and second order gradient computed from the sigmoid of `preds`.
  target <- getinfo(dtrain, "label")
  sigmoid <- 1/(1 + exp(-preds))
  gradient <- sigmoid - target
  hessian <- sigmoid * (1 - sigmoid)
  list(grad = gradient, hess = hessian)
}
# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make the built-in evaluation metrics not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
  # Error rate computed directly on margins: `preds > 0` is compared with the
  # labels; returns the pair list(metric = name, value = result).
  actual <- getinfo(dtrain, "label")
  is_positive <- preds > 0
  mismatches <- sum(actual != is_positive)
  list(metric = "error", value = as.numeric(mismatches)/length(actual))
}
print ('start training with user customized objective')
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
#
# there can be cases where you want additional information
# being considered besides the property of DMatrix you can get by getinfo
# you can set additional information as attributes of DMatrix
# set label attribute of dtrain to be label, we use label as an example, it can be anything
attr(dtrain, 'label') <- getinfo(dtrain, 'label')
# this is new customized objective, where you can access things you set
# same thing applies to customized evaluation function
logregobjattr <- function(preds, dtrain) {
  # Same logistic-loss objective, but the labels are read from the 'label'
  # attribute attached to `dtrain` rather than through getinfo().
  y <- attr(dtrain, 'label')
  prob <- 1/(1 + exp(-preds))
  list(grad = prob - y, hess = prob * (1 - prob))
}
print ('start training with user customized objective, with additional attributes in DMatrix')
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, num_round, watchlist, logregobjattr, evalerror)

View File

@@ -0,0 +1,39 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
watchlist <- list(eval = dtest)
num_round <- 20
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
  # Log-likelihood objective: squash the margins with the sigmoid and return
  # the gradient and second order gradient as list(grad, hess).
  obs <- getinfo(dtrain, "label")
  p <- 1/(1 + exp(-preds))
  g <- p - obs
  h <- p * (1 - p)
  list(grad = g, hess = h)
}
# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
  # Share of rows where the label disagrees with `preds > 0`, reported under
  # the metric name "error".
  obs <- getinfo(dtrain, "label")
  wrong <- obs != (preds > 0)
  rate <- as.numeric(sum(wrong))/length(obs)
  list(metric = "error", value = rate)
}
print ('start training with early Stopping setting')
# train with the customized objective and evaluation function defined above,
# with early stopping enabled through early.stop.round = 3
# maximize = FALSE: the custom metric is an error, so smaller is better
# (maximize must be given explicitly when feval is used with early stopping)
bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror, maximize = FALSE,
early.stop.round = 3)
# same early stopping settings passed to cross validation
# NOTE(review): `bst` is overwritten here with the xgb.cv result
bst <- xgb.cv(param, dtrain, num_round, nfold=5, obj=logregobj, feval = evalerror,
maximize = FALSE, early.stop.round = 3)

View File

@@ -0,0 +1,34 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
##
# this script demonstrate how to fit generalized linear model in xgboost
# basically, we are using linear model, instead of tree for our boosters
# you can fit a linear regression, or logistic regression model
##
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param <- list(objective = "binary:logistic", booster = "gblinear",
nthread = 2, alpha = 0.0001, lambda = 1)
# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# parallelization can affect convergence in certain cases
# setting eta to be smaller value, e.g 0.5 can make the optimization more stable
##
# the rest of settings are the same
##
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)
ypred <- predict(bst, dtest)
labels <- getinfo(dtest, 'label')
cat('error of preds=', mean(as.numeric(ypred>0.5)!=labels),'\n')

View File

@@ -0,0 +1,7 @@
# Poisson regression demo: fit a count:poisson model on mtcars, using column 11
# as the count response and the remaining columns as features.
# Attach xgboost first, like every other demo, so the script also runs
# stand-alone in a fresh session.
require(xgboost)
data(mtcars)
head(mtcars)
bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11],
              objective='count:poisson',nrounds=5)
pred = predict(bst,as.matrix(mtcars[,-11]))
# root mean squared error of the predictions on the training data
sqrt(mean((pred-mtcars[,11])^2))

View File

@@ -0,0 +1,23 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
nround = 2
# training the model for two rounds
bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist)
cat('start testing prediction from first n trees\n')
labels <- getinfo(dtest,'label')
### predict using first 1 tree
ypred1 = predict(bst, dtest, ntreelimit=1)
# by default, we predict using all the trees
ypred2 = predict(bst, dtest)
cat('error of ypred1=', mean(as.numeric(ypred1>0.5)!=labels),'\n')
cat('error of ypred2=', mean(as.numeric(ypred2>0.5)!=labels),'\n')

View File

@@ -0,0 +1,21 @@
# Demo: call predict() with predleaf = TRUE, first restricted to the first
# two trees and then using all trees.
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
nround = 5
# training the model for nround (five) rounds
bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist)
cat('start testing prediction from first n trees\n')
### predict using only the first 2 trees
pred_with_leaf = predict(bst, dtest, ntreelimit = 2, predleaf = TRUE)
head(pred_with_leaf)
# by default, we predict using all the trees
pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
head(pred_with_leaf)

11
R-package/demo/runall.R Normal file
View File

@@ -0,0 +1,11 @@
# running all scripts in demo folder
demo(basic_walkthrough)
demo(custom_objective)
demo(boost_from_prediction)
demo(predict_first_ntree)
demo(generalized_linear_model)
demo(cross_validation)
demo(create_sparse_matrix)
demo(predict_leaf_indices)
demo(early_stopping)
demo(poisson_regression)

View File

@@ -0,0 +1,32 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgboost.R
\docType{data}
\name{agaricus.test}
\alias{agaricus.test}
\title{Test part from Mushroom Data Set}
\format{A list containing a label vector, and a dgCMatrix object with 1611
rows and 126 variables}
\usage{
data(agaricus.test)
}
\description{
This data set is originally from the Mushroom data set,
UCI Machine Learning Repository.
}
\details{
This data set includes the following fields:
\itemize{
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
}
}
\references{
https://archive.ics.uci.edu/ml/datasets/Mushroom
Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
School of Information and Computer Science.
}
\keyword{datasets}

View File

@@ -0,0 +1,32 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgboost.R
\docType{data}
\name{agaricus.train}
\alias{agaricus.train}
\title{Training part from Mushroom Data Set}
\format{A list containing a label vector, and a dgCMatrix object with 6513
rows and 126 variables}
\usage{
data(agaricus.train)
}
\description{
This data set is originally from the Mushroom data set,
UCI Machine Learning Repository.
}
\details{
This data set includes the following fields:
\itemize{
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
}
}
\references{
https://archive.ics.uci.edu/ml/datasets/Mushroom
Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
School of Information and Computer Science.
}
\keyword{datasets}

42
R-package/man/getinfo.Rd Normal file
View File

@@ -0,0 +1,42 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/getinfo.xgb.DMatrix.R
\docType{methods}
\name{getinfo}
\alias{getinfo}
\alias{getinfo,xgb.DMatrix-method}
\title{Get information of an xgb.DMatrix object}
\usage{
getinfo(object, ...)
\S4method{getinfo}{xgb.DMatrix}(object, name)
}
\arguments{
\item{object}{Object of class \code{xgb.DMatrix}}
\item{...}{other parameters}
\item{name}{the name of the field to get}
}
\description{
Get information of an xgb.DMatrix object
}
\details{
The information can be one of the following:
\itemize{
\item \code{label}: the label Xgboost learns from ;
\item \code{weight}: to do a weight rescale ;
\item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
\item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
}
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all(labels2 == 1-labels))
}

View File

@@ -0,0 +1,22 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/nrow.xgb.DMatrix.R
\docType{methods}
\name{nrow,xgb.DMatrix-method}
\alias{nrow,xgb.DMatrix-method}
\title{Number of xgb.DMatrix rows}
\usage{
\S4method{nrow}{xgb.DMatrix}(x)
}
\arguments{
\item{x}{Object of class \code{xgb.DMatrix}}
}
\description{
\code{nrow} returns the number of rows present in the \code{xgb.DMatrix}.
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
stopifnot(nrow(dtrain) == nrow(train$data))
}

View File

@@ -0,0 +1,43 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/predict.xgb.Booster.R
\docType{methods}
\name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method}
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\S4method{predict}{xgb.Booster}(object, newdata, missing = NULL,
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE)
}
\arguments{
\item{object}{Object of class "xgb.Booster"}
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents a missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
\item{outputmargin}{whether the prediction should be shown in the original
value of sum of functions, when outputmargin=TRUE, the prediction is
untransformed margin value. In logistic regression, outputmargin=T will
output value before logistic transformation.}
\item{ntreelimit}{limit number of trees used in prediction, this parameter is
only valid for gbtree, but not for gblinear. set it to be value bigger
than 0. It will use all trees by default.}
\item{predleaf}{whether to predict the leaf index instead. If set to TRUE, the output will be a matrix object.}
}
\description{
Predicted values based on xgboost model object.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
pred <- predict(bst, test$data)
}

View File

@@ -0,0 +1,18 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/predict.xgb.Booster.handle.R
\docType{methods}
\name{predict,xgb.Booster.handle-method}
\alias{predict,xgb.Booster.handle-method}
\title{Predict method for eXtreme Gradient Boosting model handle}
\usage{
\S4method{predict}{xgb.Booster.handle}(object, ...)
}
\arguments{
\item{object}{Object of class "xgb.Booster.handle"}
\item{...}{Parameters passed to \code{predict.xgb.Booster}}
}
\description{
Predicted values based on xgb.Booster.handle object.
}

44
R-package/man/setinfo.Rd Normal file
View File

@@ -0,0 +1,44 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/setinfo.xgb.DMatrix.R
\docType{methods}
\name{setinfo}
\alias{setinfo}
\alias{setinfo,xgb.DMatrix-method}
\title{Set information of an xgb.DMatrix object}
\usage{
setinfo(object, ...)
\S4method{setinfo}{xgb.DMatrix}(object, name, info)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{name}{the name of the field to set}
\item{info}{the specific field of information to set}
}
\description{
Set information of an xgb.DMatrix object
}
\details{
It can be one of the following:
\itemize{
\item \code{label}: the label Xgboost learns from ;
\item \code{weight}: to do a weight rescale ;
\item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
\item \code{group}.
}
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all(labels2 == 1-labels))
}

31
R-package/man/slice.Rd Normal file
View File

@@ -0,0 +1,31 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/slice.xgb.DMatrix.R
\docType{methods}
\name{slice}
\alias{slice}
\alias{slice,xgb.DMatrix-method}
\title{Get a new DMatrix containing the specified rows of
original xgb.DMatrix object}
\usage{
slice(object, ...)
\S4method{slice}{xgb.DMatrix}(object, idxset, ...)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{idxset}{an integer vector of indices of rows needed}
}
\description{
Get a new DMatrix containing the specified rows of
original xgb.DMatrix object
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
dsub <- slice(dtrain, 1:3)
}

View File

@@ -0,0 +1,30 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Construct xgb.DMatrix object}
\usage{
xgb.DMatrix(data, info = list(), missing = 0, ...)
}
\arguments{
\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character
indicating the data file.}
\item{info}{a list of information of the xgb.DMatrix object}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents a missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
\item{...}{other information to pass to \code{info}.}
}
\description{
Construct xgb.DMatrix object from dense matrix, sparse matrix or local file.
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
dtrain <- xgb.DMatrix('xgb.DMatrix.data')
}

View File

@@ -0,0 +1,24 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.save.R
\name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file}
\usage{
xgb.DMatrix.save(DMatrix, fname)
}
\arguments{
\item{DMatrix}{the DMatrix object}
\item{fname}{the name of the binary file.}
}
\description{
Save xgb.DMatrix object to binary file
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
dtrain <- xgb.DMatrix('xgb.DMatrix.data')
}

110
R-package/man/xgb.cv.Rd Normal file
View File

@@ -0,0 +1,110 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.cv.R
\name{xgb.cv}
\alias{xgb.cv}
\title{Cross Validation}
\usage{
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
verbose = T, early_stop_round = NULL, early.stop.round = NULL,
maximize = NULL, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
\itemize{
\item \code{objective} objective function, common ones are
\itemize{
\item \code{reg:linear} linear regression
\item \code{binary:logistic} logistic regression for classification
}
\item \code{eta} step size of each boosting step
\item \code{max.depth} maximum depth of the tree
\item \code{nthread} number of thread used in training, if not set, all threads are used
}
See \link{xgb.train} for further details.
See also demo/ for walkthrough example in R.}
\item{data}{takes an \code{xgb.DMatrix} or \code{Matrix} as the input.}
\item{nrounds}{the max number of iterations}
\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
\item{label}{optional field, used when data is \code{Matrix}}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents a missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
\item{prediction}{A logical value indicating whether to return the prediction vector.}
\item{showsd}{\code{boolean}, whether show standard deviation of cross validation}
\item{metrics}{list of evaluation metrics to be used in cross validation,
when it is not specified, the evaluation metric is chosen according to objective function.
Possible options are:
\itemize{
\item \code{error} binary classification error rate
\item \code{rmse} Root mean square error
\item \code{logloss} negative log-likelihood function
\item \code{auc} Area under curve
\item \code{merror} Exact matching error, used to evaluate multi-class classification
}}
\item{obj}{customized objective function. Returns gradient and second order
gradient with given prediction and dtrain.}
\item{feval}{customized evaluation function. Returns
\code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain.}
\item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}}
\item{folds}{\code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
If folds are supplied, the nfold and stratified parameters would be ignored.}
\item{verbose}{\code{boolean}, print the statistics during the process}
\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered.
If set to an integer \code{k}, training with a validation set will stop if the performance
keeps getting worse consecutively for \code{k} rounds.}
\item{early.stop.round}{An alternative of \code{early_stop_round}.}
\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{...}{other parameters to pass to \code{params}.}
}
\value{
If \code{prediction = TRUE}, a list with the following elements is returned:
\itemize{
\item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
\item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds.
}
If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned.
}
\description{
The cross validation function of xgboost
}
\details{
The original sample is randomly partitioned into \code{nfold} equal size subsamples.
Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data.
The cross-validation process is then repeated \code{nfold} times, with each of the \code{nfold} subsamples used exactly once as the validation data.
All observations are used for both training and validation.
Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
}
\examples{
data(agaricus.train, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
history <- xgb.cv(data = dtrain, nround=3, nthread = 2, nfold = 5, metrics=list("rmse","auc"),
max.depth =3, eta = 1, objective = "binary:logistic")
print(history)
}

45
R-package/man/xgb.dump.Rd Normal file
View File

@@ -0,0 +1,45 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.dump.R
\name{xgb.dump}
\alias{xgb.dump}
\title{Save xgboost model to text file}
\usage{
xgb.dump(model = NULL, fname = NULL, fmap = "", with.stats = FALSE)
}
\arguments{
\item{model}{the model object.}
\item{fname}{the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.}
\item{fmap}{feature map file representing the type of feature.
Detailed description could be found at
\url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
See demo/ for walkthrough example in R, and
\url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
for example Format.}
\item{with.stats}{whether to dump statistics of splits.
When this option is on, the model dump comes with two additional statistics:
gain is the approximate loss function gain we get in each split;
cover is the sum of second order gradient in each node.}
}
\value{
if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
}
\description{
Save a xgboost model to text file. Could be parsed later.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
# save the model in file 'xgb.model.dump'
xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
# print the model without saving it to a file
print(xgb.dump(bst))
}

View File

@@ -0,0 +1,70 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.importance.R
\name{xgb.importance}
\alias{xgb.importance}
\title{Show importance of features in a model}
\usage{
xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL,
data = NULL, label = NULL, target = function(x) ((x + label) == 2))
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
\item{data}{the dataset used for the training step. Will be used with \code{label} parameter for co-occurrence computation. More information in \code{Detail} part. This parameter is optional.}
\item{label}{the label vector used for the training step. Will be used with \code{data} parameter for co-occurrence computation. More information in \code{Detail} part. This parameter is optional.}
\item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be counted as a co-occurrence and \code{FALSE} or \code{0} otherwise. A default function is provided for computing co-occurrences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vectors will be made of 0 and 1 only, whatever the information was before. More information in \code{Detail} part. This parameter is optional.}
}
\value{
A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
}
\description{
Read a xgboost model text dump.
Can be tree or linear model (text dumps of linear models are only supported in the dev version of \code{Xgboost} for now).
}
\details{
This is the function to understand the model trained (and through your model, your data).
Results are returned for both linear and tree models.
\code{data.table} is returned by the function.
There are 4 columns:
\itemize{
\item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
\item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then averaged per feature to give a view of the entire model. A higher percentage means a more important feature for predicting the \code{label} used for the training ;
\item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
\item \code{Weight} percentage representing the relative number of times a feature has been taken into trees. \code{Gain} should be preferred to search the most important feature. For boosted linear model, this column has no meaning.
}
Co-occurrence count
-------------------
The gain indicates how important a feature is in making a branch of a decision tree more pure. However, with this information only, you can't know whether this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
Co-occurrence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are only 92 of the 3140 observations of the train dataset where a mushroom has no odor and can be eaten safely.
If you need to remember one thing only: unless you want to leave us early, don't eat a mushroom which has no odor :-)
}
\examples{
data(agaricus.train, package='xgboost')
# Both dataset are list with two items, a sparse matrix and labels
# (labels = outcome column which will be learned).
# Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
# train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.importance(train$data@Dimnames[[2]], model = bst)
# Same thing with co-occurrence computation this time
xgb.importance(train$data@Dimnames[[2]], model = bst, data = train$data, label = train$label)
}

26
R-package/man/xgb.load.Rd Normal file
View File

@@ -0,0 +1,26 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.load.R
\name{xgb.load}
\alias{xgb.load}
\title{Load xgboost model from binary file}
\usage{
xgb.load(modelfile)
}
\arguments{
\item{modelfile}{the name of the binary file.}
}
\description{
Load xgboost model from the binary model file
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model')
pred <- predict(bst, test$data)
}

View File

@@ -0,0 +1,59 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.model.dt.tree.R
\name{xgb.model.dt.tree}
\alias{xgb.model.dt.tree}
\title{Convert tree model dump to data.table}
\usage{
xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
model = NULL, text = NULL, n_first_tree = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
\item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
}
\value{
A \code{data.table} of the features used in the model with their gain, cover and a few other things.
}
\description{
Read a tree model text dump and return a data.table.
}
\details{
General function to convert a text dump of a tree model to a \code{data.table}. The purpose is to help the user explore the model and get a better understanding of it.
The content of the \code{data.table} is organised that way:
\itemize{
\item \code{ID}: unique identifier of a node ;
\item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
\item \code{Split}: value of the chosen feature where the split is operated ;
\item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
\item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
\item \code{Missing}: ID of the feature for the next node in the branch for observations where the feature used for the split is not provided ;
\item \code{Quality}: it's the gain related to the split in this specific node ;
\item \code{Cover}: metric to measure the number of observation affected by the split ;
\item \code{Tree}: ID of the tree. It is included in the main ID ;
\item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ;
}
}
\examples{
data(agaricus.train, package='xgboost')
#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
}

View File

@@ -0,0 +1,40 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.plot.importance.R
\name{xgb.plot.importance}
\alias{xgb.plot.importance}
\title{Plot feature importance bar graph}
\usage{
xgb.plot.importance(importance_matrix = NULL, numberOfClusters = c(1:10))
}
\arguments{
\item{importance_matrix}{a \code{data.table} returned by the \code{xgb.importance} function.}
\item{numberOfClusters}{a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.}
}
\value{
A \code{ggplot2} bar graph representing each feature by a horizontal bar. The longer the bar, the more important the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
}
\description{
Read a data.table containing feature importance details and plot it.
}
\details{
The purpose of this function is to easily represent the importance of each feature of a model.
The function returns a ggplot graph, therefore each of its characteristics can be overridden (to customize it).
In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
}
\examples{
data(agaricus.train, package='xgboost')
#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#train$data@Dimnames[[2]] represents the column names of the sparse matrix.
importance_matrix <- xgb.importance(train$data@Dimnames[[2]], model = bst)
xgb.plot.importance(importance_matrix)
}

View File

@@ -0,0 +1,58 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.plot.tree.R
\name{xgb.plot.tree}
\alias{xgb.plot.tree}
\title{Plot a boosted tree model}
\usage{
xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).}
\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
\item{CSSstyle}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
\item{width}{the width of the diagram in pixels.}
\item{height}{the height of the diagram in pixels.}
}
\value{
A \code{DiagrammeR} of the model.
}
\description{
Read a tree model text dump.
Plotting only works for boosted tree model (not linear model).
}
\details{
The content of each node is organised that way:
\itemize{
\item \code{feature} value ;
\item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ;
\item \code{gain}: metric of the importance of the node in the model.
}
Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
}
\examples{
data(agaricus.train, package='xgboost')
#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
}

28
R-package/man/xgb.save.Rd Normal file
View File

@@ -0,0 +1,28 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.save.R
\name{xgb.save}
\alias{xgb.save}
\title{Save xgboost model to binary file}
\usage{
xgb.save(model, fname)
}
\arguments{
\item{model}{the model object.}
\item{fname}{the name of the binary file.}
}
\description{
Save xgboost model from xgboost or xgb.train
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model')
pred <- predict(bst, test$data)
}

View File

@@ -0,0 +1,27 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.save.raw.R
\name{xgb.save.raw}
\alias{xgb.save.raw}
\title{Save xgboost model to R's raw vector,
user can call xgb.load to load the model back from raw vector}
\usage{
xgb.save.raw(model)
}
\arguments{
\item{model}{the model object.}
}
\description{
Save xgboost model from xgboost or xgb.train
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
raw <- xgb.save.raw(bst)
bst <- xgb.load(raw)
pred <- predict(bst, test$data)
}

140
R-package/man/xgb.train.Rd Normal file
View File

@@ -0,0 +1,140 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgb.train.R
\name{xgb.train}
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}
\usage{
xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
feval = NULL, verbose = 1, printEveryN=1L, early_stop_round = NULL,
early.stop.round = NULL, maximize = NULL, ...)
}
\arguments{
\item{params}{the list of parameters.
1. General Parameters
\itemize{
\item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
\item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
}
2. Booster Parameters
2.1. Parameter for Tree Booster
\itemize{
\item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
\item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
\item \code{max_depth} maximum depth of a tree. Default: 6
\item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
\item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
\item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
\item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
}
2.2. Parameter for Linear Booster
\itemize{
\item \code{lambda} L2 regularization term on weights. Default: 0
\item \code{lambda_bias} L2 regularization term on bias. Default: 0
\item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
}
3. Task Parameters
\itemize{
\item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
\itemize{
\item \code{reg:linear} linear regression (Default).
\item \code{reg:logistic} logistic regression.
\item \code{binary:logistic} logistic regression for binary classification. Output probability.
\item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
\item \code{num_class} set the number of classes. To use only with multiclass objectives.
\item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
\item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
\item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
}
\item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
\item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
}}
\item{data}{takes an \code{xgb.DMatrix} as the input.}
\item{nrounds}{the max number of iterations}
\item{watchlist}{what information should be printed when \code{verbose=1} or
\code{verbose=2}. Watchlist is used to specify validation set monitoring
during training. For example user can specify
watchlist=list(validation1=mat1, validation2=mat2) to watch
the performance of each round's model on mat1 and mat2}
\item{obj}{customized objective function. Returns gradient and second order
gradient with given prediction and dtrain.}
\item{feval}{customized evaluation function. Returns
\code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain.}
\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
information of performance. If 2, xgboost will print information of both
performance and construction progress information}
\item{printEveryN}{Print every N-th progress message when \code{verbose>0}. Default is 1, which means all messages are printed.}
\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered.
If set to an integer \code{k}, training with a validation set will stop if the performance
keeps getting worse consecutively for \code{k} rounds.}
\item{early.stop.round}{An alternative of \code{early_stop_round}.}
\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{
An advanced interface for training xgboost model. Look at \code{\link{xgboost}} function for a simpler interface.
}
\details{
This is the training function for \code{xgboost}.
It supports advanced features such as \code{watchlist}, a customized objective function (\code{obj}) and a customized evaluation function (\code{feval});
therefore it is more flexible than the \code{\link{xgboost}} function.
Parallelization is automatically enabled if \code{OpenMP} is present.
Number of threads can also be manually specified via \code{nthread} parameter.
The \code{eval_metric} parameter (described under \code{params}) is set automatically by Xgboost but can be overridden by the user. Below is the list of metrics optimized by Xgboost, to help you understand how it works inside or to use them with the \code{watchlist} parameter.
\itemize{
\item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
\item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
\item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
\item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
\item \code{auc} Area under the curve, for ranking evaluation. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve}
\item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
}
Full list of parameters is available in the Wiki \url{https://github.com/dmlc/xgboost/wiki/Parameters}.
This function only accepts an \code{\link{xgb.DMatrix}} object as the input.
}
\examples{
data(agaricus.train, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- dtrain
watchlist <- list(eval = dtest, train = dtrain)
param <- list(max.depth = 2, eta = 1, silent = 1)
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist, logregobj, evalerror)
}

77
R-package/man/xgboost.Rd Normal file
View File

@@ -0,0 +1,77 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/xgboost.R
\name{xgboost}
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
\usage{
xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
nrounds, verbose = 1, printEveryN=1L, early_stop_round = NULL, early.stop.round = NULL,
maximize = NULL, ...)
}
\arguments{
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
\item{label}{the response variable. User should not set this field,
if data is local data file or \code{xgb.DMatrix}.}
\item{missing}{Missing is only used when input is a dense matrix; pick a float
value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
\item{params}{the list of parameters.
Commonly used ones are:
\itemize{
\item \code{objective} objective function, common ones are
\itemize{
\item \code{reg:linear} linear regression
\item \code{binary:logistic} logistic regression for classification
}
\item \code{eta} step size of each boosting step
\item \code{max.depth} maximum depth of the tree
\item \code{nthread} number of threads used in training; if not set, all threads are used
}
Look at \code{\link{xgb.train}} for a more complete list of parameters or \url{https://github.com/dmlc/xgboost/wiki/Parameters} for the full list.
See also \code{demo/} for walkthrough example in R.}
\item{nrounds}{the max number of iterations}
\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
information of performance. If 2, xgboost will print information of both
performance and construction progress information}
\item{printEveryN}{Print every N-th progress message when \code{verbose>0}. Default is 1, which means all messages are printed.}
\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered.
If set to an integer \code{k}, training with a validation set will stop if the performance
keeps getting worse consecutively for \code{k} rounds.}
\item{early.stop.round}{An alternative of \code{early_stop_round}.}
\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well.
\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{
A simple interface for training xgboost model. Look at \code{\link{xgb.train}} function for a more advanced interface.
}
\details{
This is the modeling function for Xgboost.
Parallelization is automatically enabled if \code{OpenMP} is present.
Number of threads can also be manually specified via \code{nthread} parameter.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
pred <- predict(bst, test$data)
}

8
R-package/src/Makevars Normal file
View File

@@ -0,0 +1,8 @@
# package root
PKGROOT=../../
# _*_ mode: Makefile; _*_
# Preprocessor switches and include path for the xgboost/rabit sources.
# NOTE(review): the *_CUSTOMIZE_* / *_STRICT_CXX98_ macros presumably make the
# core library use the R-backed hooks defined in xgboost_R.cpp -- confirm.
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT)
# The OpenMP code in this package is C++, so the C++ OpenMP macro must be used
# for both compiling and linking (the C macro may differ or be empty when the
# C and C++ compilers are not the same toolchain).
PKG_CXXFLAGS= $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
# Object files linked into the package shared library.
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o

View File

@@ -0,0 +1,19 @@
# package root
PKGROOT=./
# _*_ mode: Makefile; _*_
# This file is only used for windows compilation from github
# It will be replaced by Makevars in CRAN version
.PHONY: all xgblib
all: $(SHLIB)
$(SHLIB): xgblib
# Copy the core sources next to the R glue code so that PKGROOT=./ resolves.
xgblib:
	cp -r ../../src .
	cp -r ../../wrapper .
	cp -r ../../subtree .
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../..
# The OpenMP code in this package is C++, so the C++ OpenMP macro must be used
# for both compiling and linking.
PKG_CXXFLAGS= $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
# Every object depends on the source copy having been made first.
$(OBJECTS) : xgblib

322
R-package/src/xgboost_R.cpp Normal file
View File

@@ -0,0 +1,322 @@
#include <vector>
#include <string>
#include <utility>
#include <cstring>
#include <cstdio>
#include <sstream>
#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
#include "src/utils/omp.h"
#include "xgboost_R.h"
using namespace std;
using namespace xgboost;
// C-linkage helpers used below as the targets of the utils::Assert,
// utils::Check and utils::SPrintf function pointers.
// NOTE(review): presumably defined in the package's C source
// (xgboost_assert.o is listed in OBJECTS) -- confirm.
extern "C" {
// Raises an R error prefixed with "AssertError:" when exp is 0.
void XGBoostAssert_R(int exp, const char *fmt, ...);
// Raises an R error with the formatted message when exp is 0.
void XGBoostCheck_R(int exp, const char *fmt, ...);
// Bounded printf-style formatting into buf (at most size bytes).
int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...);
}
// Implements the message, error, math and random-number hooks of the
// xgboost namespace on top of the R API.
// NOTE(review): presumably these are the symbols the core library expects the
// host to supply when built with XGBOOST_CUSTOMIZE_MSG_ /
// XGBOOST_CUSTOMIZE_PRNG_ / XGBOOST_STRICT_CXX98_ -- confirm against src/utils.
namespace xgboost {
namespace utils {
extern "C" {
  // Function-pointer hooks bound to R's printing and error facilities and to
  // the C helpers declared at the top of this file.
  void (*Printf)(const char *fmt, ...) = Rprintf;
  int (*SPrintf)(char *buf, size_t size, const char *fmt, ...) = XGBoostSPrintf_R;
  void (*Assert)(int exp, const char *fmt, ...) = XGBoostAssert_R;
  void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R;
  void (*Error)(const char *fmt, ...) = error;
}
// Reports whether v is NaN, using R's ISNAN macro.
bool CheckNAN(double v) {
  return ISNAN(v);
}
// Returns the log-gamma of v as computed by Rmath's lgammafn.
// The return type must be double: returning bool would collapse every
// non-zero result to 1 and lose the actual value.
// NOTE(review): confirm the declaration seen by callers also returns double.
double LogGamma(double v) {
  return lgammafn(v);
}
}  // namespace utils
namespace random {
// Seeding is left to R: the seed argument is ignored and a warning tells the
// user to call set.seed instead.
void Seed(unsigned seed) {
  warning("parameter seed is ignored, please set random seed using set.seed");
}
// Draws one value from R's uniform generator.
double Uniform(void) {
  return unif_rand();
}
// Draws one value from R's standard normal generator.
double Normal(void) {
  return norm_rand();
}
}  // namespace random
}  // namespace xgboost
// Called on entry to every wrapper: loads R's RNG state so that
// unif_rand()/norm_rand() may be used until _WrapperEnd().
inline void _WrapperBegin() {
  GetRNGstate();
}
// Called once the wrapped native call has finished: writes the RNG state
// back to R (counterpart of _WrapperBegin()).
inline void _WrapperEnd() {
  PutRNGstate();
}
extern "C" {
// Returns an R logical scalar that is TRUE when the external pointer held by
// `handle` has a NULL address.
SEXP XGCheckNullPtr_R(SEXP handle) {
  const bool is_null = (R_ExternalPtrAddr(handle) == NULL);
  return ScalarLogical(is_null);
}
// Finalizer registered on DMatrix external pointers: frees the native matrix
// and clears the pointer so a second invocation is a no-op.
void _DMatrixFinalizer(SEXP ext) {
  void *dmat = R_ExternalPtrAddr(ext);
  if (dmat != NULL) {
    XGDMatrixFree(dmat);
    R_ClearExternalPtr(ext);
  }
}
// Loads a DMatrix from the file named by `fname` and returns it as an
// external pointer whose finalizer frees the native object.
// `silent` is forwarded as an integer flag to XGDMatrixCreateFromFile.
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
  _WrapperBegin();
  const char *path = CHAR(asChar(fname));
  const int quiet = asInteger(silent);
  void *dmat = XGDMatrixCreateFromFile(path, quiet);
  _WrapperEnd();
  SEXP result = PROTECT(R_MakeExternalPtr(dmat, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(result, _DMatrixFinalizer, TRUE);
  UNPROTECT(1);
  return result;
}
// Builds a DMatrix from a dense R matrix.
// The matrix is read through REAL() in column-major order and copied into a
// row-major float buffer before being handed to XGDMatrixCreateFromMat.
// `missing` is the value that marks missing entries.
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
                              SEXP missing) {
  _WrapperBegin();
  const int *dims = INTEGER(getAttrib(mat, R_DimSymbol));
  const size_t nrow = static_cast<size_t>(dims[0]);
  const size_t ncol = static_cast<size_t>(dims[1]);
  const double *src = REAL(mat);
  std::vector<float> dense(nrow * ncol);
  // Transpose: element (r, c) lives at src[r + nrow * c] in R's layout.
  #pragma omp parallel for schedule(static)
  for (bst_omp_uint r = 0; r < nrow; ++r) {
    for (size_t c = 0; c < ncol; ++c) {
      dense[r * ncol + c] = src[r + nrow * c];
    }
  }
  void *dmat = XGDMatrixCreateFromMat(BeginPtr(dense), nrow, ncol, asReal(missing));
  _WrapperEnd();
  SEXP result = PROTECT(R_MakeExternalPtr(dmat, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(result, _DMatrixFinalizer, TRUE);
  UNPROTECT(1);
  return result;
}
// Builds a DMatrix from CSC-format vectors: `indptr` (integer column
// pointers), `indices` (integer row indices) and `data` (double values).
// Each vector is converted to the element type XGDMatrixCreateFromCSC takes.
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
                              SEXP indices,
                              SEXP data) {
  _WrapperBegin();
  const int *p_indptr = INTEGER(indptr);
  const int *p_indices = INTEGER(indices);
  const double *p_data = REAL(data);
  const int nindptr = length(indptr);
  const int ndata = length(data);
  // Column pointers are widened element by element via the range constructor.
  std::vector<bst_ulong> col_ptr(p_indptr, p_indptr + nindptr);
  std::vector<unsigned> row_idx(ndata);
  std::vector<float> values(ndata);
  #pragma omp parallel for schedule(static)
  for (int k = 0; k < ndata; ++k) {
    row_idx[k] = static_cast<unsigned>(p_indices[k]);
    values[k] = static_cast<float>(p_data[k]);
  }
  void *dmat = XGDMatrixCreateFromCSC(BeginPtr(col_ptr), BeginPtr(row_idx),
                                      BeginPtr(values), nindptr, ndata);
  _WrapperEnd();
  SEXP result = PROTECT(R_MakeExternalPtr(dmat, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(result, _DMatrixFinalizer, TRUE);
  UNPROTECT(1);
  return result;
}
// Creates a new DMatrix holding the rows of `handle` selected by `idxset`.
// The indices arrive 1-based from R and are shifted to 0-based here.
SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
  _WrapperBegin();
  const int n_idx = length(idxset);
  const int *r_idx = INTEGER(idxset);
  std::vector<int> zero_based(n_idx);
  for (int k = 0; k < n_idx; ++k) {
    zero_based[k] = r_idx[k] - 1;
  }
  void *sliced = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(zero_based), n_idx);
  _WrapperEnd();
  SEXP result = PROTECT(R_MakeExternalPtr(sliced, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(result, _DMatrixFinalizer, TRUE);
  UNPROTECT(1);
  return result;
}
// Saves the DMatrix behind `handle` to the binary file named by `fname`;
// `silent` is forwarded as an integer flag.
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
  _WrapperBegin();
  void *dmat = R_ExternalPtrAddr(handle);
  const char *path = CHAR(asChar(fname));
  const int quiet = asInteger(silent);
  XGDMatrixSaveBinary(dmat, path, quiet);
  _WrapperEnd();
}
// Attaches a meta-information vector to the DMatrix behind `handle`.
//   field: name of the field; "group" is read as an integer vector and
//          passed to XGDMatrixSetGroup, any other name is read as a double
//          vector, narrowed to float and passed to XGDMatrixSetFloatInfo.
//   array: the values to store.
// The R accessors INTEGER()/REAL() are called once, outside the OpenMP
// regions, because the R API must not be entered from worker threads; the
// parallel loops then touch only raw pointers.
void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
  _WrapperBegin();
  const int len = length(array);
  const char *name = CHAR(asChar(field));
  if (!strcmp("group", name)) {
    // Skip the accessor for empty input so a zero-length argument is never
    // type-checked (same as the original, which never dereferenced it).
    const int *src = len > 0 ? INTEGER(array) : NULL;
    std::vector<unsigned> vec(len);
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < len; ++i) {
      vec[i] = static_cast<unsigned>(src[i]);
    }
    XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len);
  } else {
    const double *src = len > 0 ? REAL(array) : NULL;
    std::vector<float> vec(len);
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < len; ++i) {
      vec[i] = static_cast<float>(src[i]);
    }
    XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
                          name,
                          BeginPtr(vec), len);
  }
  _WrapperEnd();
}
// Fetches the float info vector named by `field` from the DMatrix behind
// `handle` and returns it as an R numeric vector.
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
  _WrapperBegin();
  bst_ulong n_out;
  const float *info = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
                                            CHAR(asChar(field)), &n_out);
  _WrapperEnd();
  SEXP ret = PROTECT(allocVector(REALSXP, n_out));
  double *dst = REAL(ret);
  for (bst_ulong k = 0; k < n_out; ++k) {
    dst[k] = info[k];
  }
  UNPROTECT(1);
  return ret;
}
// Returns the number of rows of the DMatrix behind `handle` as an R integer
// scalar (the native count is narrowed to int).
SEXP XGDMatrixNumRow_R(SEXP handle) {
  const bst_ulong n_rows = XGDMatrixNumRow(R_ExternalPtrAddr(handle));
  const int as_int = static_cast<int>(n_rows);
  return ScalarInteger(as_int);
}
// functions related to booster
// Finalizer registered on booster external pointers: frees the native booster
// and clears the pointer so a second invocation is a no-op.
void _BoosterFinalizer(SEXP ext) {
  void *booster = R_ExternalPtrAddr(ext);
  if (booster != NULL) {
    XGBoosterFree(booster);
    R_ClearExternalPtr(ext);
  }
}
// Creates a booster from `dmats`, an R list of DMatrix external pointers,
// and returns it as an external pointer with _BoosterFinalizer attached.
SEXP XGBoosterCreate_R(SEXP dmats) {
  _WrapperBegin();
  const int n_mats = length(dmats);
  std::vector<void*> mat_handles(n_mats);
  for (int k = 0; k < n_mats; ++k) {
    mat_handles[k] = R_ExternalPtrAddr(VECTOR_ELT(dmats, k));
  }
  void *booster = XGBoosterCreate(BeginPtr(mat_handles), mat_handles.size());
  _WrapperEnd();
  SEXP result = PROTECT(R_MakeExternalPtr(booster, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(result, _BoosterFinalizer, TRUE);
  UNPROTECT(1);
  return result;
}
// Sets parameter `name` to `val` on the booster behind `handle`; both are
// coerced to C strings.
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
  _WrapperBegin();
  void *booster = R_ExternalPtrAddr(handle);
  XGBoosterSetParam(booster, CHAR(asChar(name)), CHAR(asChar(val)));
  _WrapperEnd();
}
// Runs one update of the booster behind `handle` on the training matrix
// `dtrain`; `iter` is the current iteration number.
void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
  _WrapperBegin();
  void *booster = R_ExternalPtrAddr(handle);
  const int round = asInteger(iter);
  void *train = R_ExternalPtrAddr(dtrain);
  XGBoosterUpdateOneIter(booster, round, train);
  _WrapperEnd();
}
// Runs one boosting step with caller-supplied gradient statistics.
//   grad, hess: numeric vectors of equal length (checked; a mismatch raises
//               an error through utils::Check).
// The vectors are narrowed to float before XGBoosterBoostOneIter is called.
// REAL() is called once per vector, outside the OpenMP region, because the
// R API must not be entered from worker threads.
void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
  _WrapperBegin();
  utils::Check(length(grad) == length(hess), "gradient and hess must have same length");
  const int len = length(grad);
  // Skip the accessors for empty input so zero-length arguments are never
  // type-checked (same as the original, which never dereferenced them).
  const double *p_grad = len > 0 ? REAL(grad) : NULL;
  const double *p_hess = len > 0 ? REAL(hess) : NULL;
  std::vector<float> tgrad(len), thess(len);
  #pragma omp parallel for schedule(static)
  for (int j = 0; j < len; ++j) {
    tgrad[j] = static_cast<float>(p_grad[j]);
    thess[j] = static_cast<float>(p_hess[j]);
  }
  XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
                        R_ExternalPtrAddr(dtrain),
                        BeginPtr(tgrad), BeginPtr(thess), len);
  _WrapperEnd();
}
// Evaluates the booster behind `handle` on each matrix in `dmats`, labelled
// by the matching entry of `evnames`, and returns the evaluation string
// produced by XGBoosterEvalOneIter as an R character scalar.
// `dmats` and `evnames` must have the same length (checked).
SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
  _WrapperBegin();
  utils::Check(length(dmats) == length(evnames), "dmats and evnams must have same length");
  const int n_eval = length(dmats);
  std::vector<void*> mat_handles(n_eval);
  std::vector<std::string> names(n_eval);
  for (int k = 0; k < n_eval; ++k) {
    mat_handles[k] = R_ExternalPtrAddr(VECTOR_ELT(dmats, k));
    names[k] = CHAR(asChar(VECTOR_ELT(evnames, k)));
  }
  // The c_str() pointers are taken only after `names` is complete, so they
  // stay valid for the native call.
  std::vector<const char*> name_ptrs(n_eval);
  for (int k = 0; k < n_eval; ++k) {
    name_ptrs[k] = names[k].c_str();
  }
  const char *msg = XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
                                         asInteger(iter),
                                         BeginPtr(mat_handles),
                                         BeginPtr(name_ptrs), n_eval);
  _WrapperEnd();
  return mkString(msg);
}
// Predicts on `dmat` with the booster behind `handle` and returns the result
// as an R numeric vector. `option_mask` and `ntree_limit` are forwarded as
// integers to XGBoosterPredict.
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) {
  _WrapperBegin();
  bst_ulong n_out;
  const float *pred = XGBoosterPredict(R_ExternalPtrAddr(handle),
                                       R_ExternalPtrAddr(dmat),
                                       asInteger(option_mask),
                                       asInteger(ntree_limit),
                                       &n_out);
  _WrapperEnd();
  SEXP ret = PROTECT(allocVector(REALSXP, n_out));
  double *dst = REAL(ret);
  for (bst_ulong k = 0; k < n_out; ++k) {
    dst[k] = pred[k];
  }
  UNPROTECT(1);
  return ret;
}
// Loads a model from the file named by `fname` into the booster behind
// `handle`.
void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
  _WrapperBegin();
  const char *path = CHAR(asChar(fname));
  XGBoosterLoadModel(R_ExternalPtrAddr(handle), path);
  _WrapperEnd();
}
// Saves the model of the booster behind `handle` to the file named by
// `fname`.
void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
  _WrapperBegin();
  const char *path = CHAR(asChar(fname));
  XGBoosterSaveModel(R_ExternalPtrAddr(handle), path);
  _WrapperEnd();
}
// Loads a model into the booster behind `handle` from the bytes of the R raw
// vector `raw`.
void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) {
  _WrapperBegin();
  void *booster = R_ExternalPtrAddr(handle);
  XGBoosterLoadModelFromBuffer(booster, RAW(raw), length(raw));
  _WrapperEnd();
}
// Serializes the model of the booster behind `handle` and returns the bytes
// as an R raw vector (empty when the native call reports zero length).
SEXP XGBoosterModelToRaw_R(SEXP handle) {
  bst_ulong n_bytes;
  _WrapperBegin();
  const char *bytes = XGBoosterGetModelRaw(R_ExternalPtrAddr(handle), &n_bytes);
  _WrapperEnd();
  SEXP ret = PROTECT(allocVector(RAWSXP, n_bytes));
  if (n_bytes > 0) {
    memcpy(RAW(ret), bytes, n_bytes);
  }
  UNPROTECT(1);
  return ret;
}
// Dumps the model of the booster behind `handle` as an R character vector
// with one element per dumped entry, each prefixed with "booster[i]\n".
// `fmap` and `with_stats` are forwarded to XGBoosterDumpModel.
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
  _WrapperBegin();
  bst_ulong n_dump;
  const char **dumps = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
                                          CHAR(asChar(fmap)),
                                          asInteger(with_stats),
                                          &n_dump);
  _WrapperEnd();
  SEXP out = PROTECT(allocVector(STRSXP, n_dump));
  for (size_t k = 0; k < n_dump; ++k) {
    std::ostringstream text;
    text << "booster[" << k << "]\n" << dumps[k];
    const std::string entry = text.str();
    SET_STRING_ELT(out, k, mkChar(entry.c_str()));
  }
  UNPROTECT(1);
  return out;
}
}

156
R-package/src/xgboost_R.h Normal file
View File

@@ -0,0 +1,156 @@
#ifndef XGBOOST_WRAPPER_R_H_
#define XGBOOST_WRAPPER_R_H_
/*!
* \file xgboost_R.h
* \author Tianqi Chen
* \brief R wrapper of xgboost
*/
extern "C" {
#include <Rinternals.h>
#include <R_ext/Random.h>
#include <Rmath.h>
}
extern "C" {
/*!
* \brief check whether a handle is NULL
* \param handle
* \return whether it is null ptr
*/
SEXP XGCheckNullPtr_R(SEXP handle);
/*!
* \brief load a data matrix
* \param fname name of the content
* \param silent whether print messages
* \return a loaded data matrix
*/
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
/*!
* \brief create matrix content from dense matrix
* This assumes the matrix is stored in column major format
* \param mat R Matrix object
* \param missing which value to represent missing value
* \return created dmatrix
*/
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP missing);
/*!
* \brief create a matrix content from CSC format
* \param indptr pointer to column headers
* \param indices row indices
* \param data content of the data
* \return created dmatrix
*/
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data);
/*!
* \brief create a new dmatrix from sliced content of existing matrix
* \param handle instance of data matrix to be sliced
* \param idxset index set
* \return a sliced new matrix
*/
SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset);
/*!
* \brief save a data matrix into binary file
* \param handle a instance of data matrix
* \param fname file name
* \param silent print statistics when saving
*/
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
/*!
* \brief set information to dmatrix
* \param handle a instance of data matrix
* \param field field name, can be label, weight
* \param array pointer to float vector
*/
void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);
/*!
* \brief get info vector from matrix
* \param handle a instance of data matrix
* \param field field name
* \return info vector
*/
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
/*!
* \brief return number of rows
* \param handle a instance of data matrix
*/
SEXP XGDMatrixNumRow_R(SEXP handle);
/*!
* \brief create xgboost learner
* \param dmats a list of dmatrix handles that will be cached
*/
SEXP XGBoosterCreate_R(SEXP dmats);
/*!
* \brief set parameters
* \param handle handle
* \param name parameter name
* \param val value of parameter
*/
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val);
/*!
* \brief update the model in one round using dtrain
* \param handle handle
* \param iter current iteration rounds
* \param dtrain training data
*/
void XGBoosterUpdateOneIter_R(SEXP ext, SEXP iter, SEXP dtrain);
/*!
* \brief update the model, by directly specify gradient and second order gradient,
* this can be used to replace UpdateOneIter, to support customized loss function
* \param handle handle
* \param dtrain training data
* \param grad gradient statistics
* \param hess second order gradient statistics
*/
void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess);
/*!
* \brief get evaluation statistics for xgboost
* \param handle handle
* \param iter current iteration rounds
* \param dmats list of handles to dmatrices
* \param evnames names of the evaluation matrices
* \return the string containing evaluation statistics
*/
SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames);
/*!
* \brief make prediction based on dmat
* \param handle handle
* \param dmat data matrix
* \param option_mask output_margin:1 predict_leaf:2
* \param ntree_limit limit number of trees used in prediction
*/
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit);
/*!
* \brief load model from existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterLoadModel_R(SEXP handle, SEXP fname);
/*!
* \brief save model into existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
/*!
* \brief load model from raw array
* \param handle handle
*/
void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw);
/*!
* \brief save model into R's raw array
* \param handle handle
* \return raw array
*/
SEXP XGBoosterModelToRaw_R(SEXP handle);
/*!
* \brief dump model into a string
* \param handle handle
* \param fmap name to fmap can be empty string
* \param with_stats whether dump statistics of splits
*/
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
}
#endif // XGBOOST_WRAPPER_R_H_

View File

@@ -0,0 +1,33 @@
#include <stdio.h>
#include <stdarg.h>
#include <Rinternals.h>
// implements error handling
/*
 * Raises an R error "AssertError:<message>" when exp is 0; does nothing
 * otherwise. fmt and the variadic arguments are printf-style.
 * The message is formatted with vsnprintf so that it is truncated to the
 * buffer size instead of overflowing the stack buffer.
 */
void XGBoostAssert_R(int exp, const char *fmt, ...) {
  char buf[1024];
  if (exp == 0) {
    va_list args;
    va_start(args, fmt);
    vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args);
    error("AssertError:%s\n", buf);
  }
}
/*
 * Raises an R error carrying the formatted message when exp is 0; does
 * nothing otherwise. fmt and the variadic arguments are printf-style.
 * The message is formatted with vsnprintf so that it is truncated to the
 * buffer size instead of overflowing the stack buffer.
 */
void XGBoostCheck_R(int exp, const char *fmt, ...) {
  char buf[1024];
  if (exp == 0) {
    va_list args;
    va_start(args, fmt);
    vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args);
    error("%s\n", buf);
  }
}
/*
 * Bounded printf-style formatting: writes at most size bytes (including the
 * terminator) into buf and returns whatever vsnprintf returns.
 */
int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...) {
  int written;
  va_list ap;
  va_start(ap, fmt);
  written = vsnprintf(buf, size, fmt, ap);
  va_end(ap);
  return written;
}

View File

@@ -0,0 +1,337 @@
---
title: "Understand your dataset with Xgboost"
output:
rmarkdown::html_vignette:
css: vignette.css
number_sections: yes
toc: yes
author: Tianqi Chen, Tong He, Michaël Benesty
vignette: >
%\VignetteIndexEntry{Discover your data}
%\VignetteEngine{knitr::rmarkdown}
\usepackage[utf8]{inputenc}
---
Introduction
============
The purpose of this Vignette is to show you how to use **Xgboost** to discover and understand your own dataset better.
This Vignette is not about predicting anything (see [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). We will explain how to use **Xgboost** to highlight the *link* between the *features* of your data and the *outcome*.
Package loading:
```{r libLoading, results='hold', message=F, warning=F}
require(xgboost)
require(Matrix)
require(data.table)
if (!require('vcd')) install.packages('vcd')
```
> **VCD** package is used for one of its embedded dataset only.
Preparation of the dataset
==========================
Numeric VS categorical variables
--------------------------------
**Xgboost** manages only `numeric` vectors.
What to do when you have *categorical* data?
A *categorical* variable has a fixed number of different values. For instance, if a variable called *Colour* can have only one of these three values, *red*, *blue* or *green*, then *Colour* is a *categorical* variable.
> In **R**, a *categorical* variable is called `factor`.
>
> Type `?factor` in the console for more information.
To answer the question above we will convert *categorical* variables to `numeric` ones.
Conversion from categorical to numeric variables
------------------------------------------------
### Looking at the raw data
In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = few zeroes in the matrix) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero in the matrix) of `numeric` features.
The method we are going to see is usually called [one-hot encoding](http://en.wikipedia.org/wiki/One-hot).
The first step is to load `Arthritis` dataset in memory and wrap it with `data.table` package.
```{r, results='hide'}
data(Arthritis)
df <- data.table(Arthritis, keep.rownames = F)
```
> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large datasets is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of the **Xgboost** **R** package use `data.table`.
The first thing we want to do is to have a look at the first lines of the `data.table`:
```{r}
head(df)
```
Now we will check the format of each column.
```{r}
str(df)
```
2 columns have `factor` type, one has `ordinal` type.
> `ordinal` variable :
>
> * can take a limited number of values (like `factor`) ;
> * these values are ordered (unlike `factor`). Here these ordered values are: `Marked > Some > None`
### Creation of new features based on old ones
We will add some new *categorical* features to see if it helps.
#### Grouping per 10 years
For the first feature we create groups of age by rounding the real age.
Note that we transform it to `factor` so the algorithm treats these age groups as independent values.
Therefore, 20 is not closer to 30 than 60. To make it short, the distance between ages is lost in this transformation.
```{r}
head(df[,AgeDiscret := as.factor(round(Age/10,0))])
```
#### Random split in two groups
Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...).
```{r}
head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))])
```
#### Risks in adding correlated features
These new features are highly correlated to the `Age` feature because they are simple transformations of this feature.
For many machine learning algorithms, using correlated features is not a good idea. It may sometimes make prediction less accurate, and most of the time make interpretation of the model almost impossible. GLM, for instance, assumes that the features are uncorrelated.
Fortunately, decision tree algorithms (including boosted trees) are very robust to these features. Therefore we have nothing to do to manage this situation.
#### Cleaning data
We remove ID as there is nothing to learn from this feature (it would just add some noise).
```{r, results='hide'}
df[,ID:=NULL]
```
We will list the different values for the column `Treatment`:
```{r}
levels(df[,Treatment])
```
### One-hot encoding
Next step, we will transform the categorical data to dummy variables.
This is the [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) step.
The purpose is to transform each value of each *categorical* feature in a *binary* feature `{0, 1}`.
For example, the column `Treatment` will be replaced by two columns, `Placebo`, and `Treated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `Placebo` and the value `0` in the new column `Treated`. The column `Treatment` will disappear during the one-hot encoding.
Column `Improved` is excluded because it will be our `label` column, the one we want to predict.
```{r, warning=FALSE,message=FALSE}
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
head(sparse_matrix)
```
> The formula `Improved~.-1` used above means: transform all *categorical* features but column `Improved` to binary values. The `-1` is here to remove the first column which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console.
Create the output `numeric` vector (not as a sparse `Matrix`):
```{r}
output_vector = df[,Improved] == "Marked"
```
1. set `Y` vector to `0`;
2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ;
3. return `Y` vector.
Build the model
===============
The code below is very usual. For more information, you can look at the documentation of `xgboost` function (or at the vignette [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)).
```{r}
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 4,
eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
```
You can see some `train-error: 0.XXXXX` lines followed by a number. It decreases. Each line shows how well the model explains your data. Lower is better.
A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copies the past too closely, and won't be that good at predicting the future).
> Here you can see the numbers decrease until line 7 and then increase.
>
> It probably means we are overfitting. To fix that I should reduce the number of rounds to `nround = 4`. I will let things like that because I don't really care for the purpose of this example :-)
Feature importance
==================
Measure feature importance
--------------------------
### Build the feature importance data.table
In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature).
```{r}
importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst)
head(importance)
```
> The column `Gain` provides the information we are looking for.
>
> As you can see, features are classified by `Gain`.
`Gain` is the improvement in accuracy brought by a feature to the branches it is on. The idea is that before adding a new split on a feature X to the branch there were some wrongly classified elements; after adding the split on this feature, there are two new branches, and each of these branches is more accurate (one branch saying if your observation is on this branch then it should be classified as `1`, and the other branch saying the exact opposite).
`Cover` measures the relative quantity of observations concerned by a feature.
`Frequence` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
### Improvement in the interpretability of feature importance data.table
We can go deeper in the analysis of the model. In the `data.table` above, we have discovered which features count to predict if the illness will go or not. But we don't yet know the role of these features. For instance, one of the questions we may want to answer would be: does receiving a placebo treatment help to recover from the illness?
One simple solution is to count the co-occurrences of a feature and a class of the classification.
For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.
```{r}
importanceRaw <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
# Cleaning for better display
importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequence=NULL)]
head(importanceClean)
```
> In the table above we have removed two unneeded columns and selected only the first lines.
First thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the tree. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature `Age` is used several times with different splits.
How is the split applied to count the co-occurrences? It is always `<`. For instance, in the second line, we measure the number of persons under 61.5 years with the illness gone after the treatment.
The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observations in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the whole population that `RealCover` represents.
Therefore, according to our findings, getting a placebo doesn't seem to help but being younger than 61 years may help (seems logical).
> You may wonder how to interpret the `< 1.00001` on the first line. Basically, in a sparse `Matrix`, there is no `0`, therefore, looking for one hot-encoded categorical observations validating the rule `< 1.00001` is like just looking for `1` for this feature.
Plotting the feature importance
-------------------------------
All these things are nice, but it would be even better to plot the results.
```{r, fig.width=8, fig.height=5, fig.align='center'}
xgb.plot.importance(importance_matrix = importanceRaw)
```
Features have automatically been divided in 2 clusters: the interesting features... and the others.
> Depending on the dataset and the learning parameters you may have more than two clusters. Default value is to limit them to `10`, but you can increase this limit. Look at the function documentation for more information.
According to the plot above, the most important features in this dataset to predict if the treatment will work are :
* the Age ;
* having received a placebo or not ;
* the sex is third but already included in the not interesting features group ;
* then we see our generated features (AgeDiscret). We can see that their contribution is very low.
Do these results make sense?
------------------------------
Let's check some **Chi2** between each of these features and the label.
Higher **Chi2** means better correlation.
```{r, warning=FALSE, message=FALSE}
c2 <- chisq.test(df$Age, output_vector)
print(c2)
```
Pearson correlation between Age and illness disappearing is **`r round(c2$statistic, 2 )`**.
```{r, warning=FALSE, message=FALSE}
c2 <- chisq.test(df$AgeDiscret, output_vector)
print(c2)
```
Our first simplification of Age gives a Pearson correlation of **`r round(c2$statistic, 2)`**.
```{r, warning=FALSE, message=FALSE}
c2 <- chisq.test(df$AgeCat, output_vector)
print(c2)
```
The perfectly random split I did between young and old at 30 years old has a low correlation of **`r round(c2$statistic, 2)`**. It's a result we may expect: maybe in my mind being over 30 years is being old (I am 32 and starting to feel old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same.
Morality: don't let your *gut* lower the quality of your model.
In *data science* expression, there is the word *science* :-)
Conclusion
==========
As you can see, in general *destroying information by simplifying it won't improve your model*. **Chi2** just demonstrates that.
But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model.
The case studied here is not complex enough to show that. Check [Kaggle website](http://www.kaggle.com/) for some challenging datasets. However it's almost always worse when you add some arbitrary rules.
Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm has been able to choose the best one, which in this case is the Age.
Linear model may not be that smart in this scenario.
Special Note: What about Random Forests™?
==========================================
As you may know, [Random Forests™](http://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family.
Both train several decision trees for one dataset. The *main* difference is that in Random Forests™, trees are independent and in boosting, the tree `N+1` focuses its learning on the loss (<=> what has not been well modeled by the tree `N`).
This difference has an impact on a corner case in feature importance analysis: the *correlated features*.
Imagine two features perfectly correlated, feature `A` and feature `B`. For one specific tree, if the algorithm needs one of them, it will choose randomly (true in both boosting and Random Forests™).
However, in Random Forests™ this random choice will be done for each tree, because each tree is independent from the others. Therefore, approximately, depending on your parameters, 50% of the trees will choose feature `A` and the other 50% will choose feature `B`. So the *importance* of the information contained in `A` and `B` (which is the same, because they are perfectly correlated) is diluted in `A` and `B`. So you won't easily know this information is important to predict what you want to predict! It is even worse when you have 10 correlated features...
In boosting, when a specific link between feature and outcome has been learned by the algorithm, it will try to not refocus on it (in theory it is what happens, reality is not always that simple). Therefore, all the importance will be on feature `A` or on feature `B` (but not both). You will know that one feature has an important role in the link between the observations and the label. It is still up to you to search for the correlated features to the one detected as important if you need to know all of them.
If you want to try Random Forests™ algorithm, you can tweak Xgboost parameters!
**Warning**: this is still an experimental parameter.
For instance, to compute a model with 1000 trees, with a 0.5 factor on sampling rows and columns:
```{r, warning=FALSE, message=FALSE}
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
#Random Forest™ - 1000 trees
bst <- xgboost(data = train$data, label = train$label, max.depth = 4, num_parallel_tree = 1000, subsample = 0.5, colsample_bytree =0.5, nround = 1, objective = "binary:logistic")
#Boosting - 3 rounds
bst <- xgboost(data = train$data, label = train$label, max.depth = 4, nround = 3, objective = "binary:logistic")
```
> Note that the parameter `nround` is set to `1`.
> [**Random Forests™**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) is a trademark of Leo Breiman and Adele Cutler and is licensed exclusively to Salford Systems for the commercial release of the software.

View File

@@ -0,0 +1,225 @@
body {
margin: 0 auto;
background-color: white;
/* --------- FONT FAMILY --------
following are some optional font families. Usually a family
is safer to choose than a specific font,
which may not be on the users computer */
/ font-family:Georgia, Palatino, serif;
font-family: "Open Sans", "Book Antiqua", Palatino, serif;
/ font-family:Arial, Helvetica, sans-serif;
/ font-family:Tahoma, Verdana, Geneva, sans-serif;
/ font-family:Courier, monospace;
/ font-family:"Times New Roman", Times, serif;
/* -------------- COLOR OPTIONS ------------
following are additional color options for base font
you could uncomment another one to easily change the base color
or add one to a specific element style below */
color: #333333; /* dark gray not black */
/ color: #000000; /* black */
/ color: #666666; /* medium gray black */
/ color: #E3E3E3; /* very light gray */
/ color: white;
line-height: 100%;
max-width: 800px;
padding: 10px;
font-size: 17px;
text-align: justify;
text-justify: inter-word;
}
p {
line-height: 150%;
/ max-width: 540px;
max-width: 960px;
margin-bottom: 5px;
font-weight: 400;
/ color: #333333
}
h1, h2, h3, h4, h5, h6 {
font-weight: 400;
margin-top: 35px;
margin-bottom: 15px;
padding-top: 10px;
}
h1 {
margin-top: 70px;
color: #606AAA;
font-size:230%;
font-variant:small-caps;
padding-bottom:20px;
width:100%;
border-bottom:1px solid #606AAA;
}
h2 {
font-size:160%;
}
h3 {
font-size:130%;
}
h4 {
font-size:120%;
font-variant:small-caps;
}
h5 {
font-size:120%;
}
h6 {
font-size:120%;
font-variant:small-caps;
}
a {
color: #606AAA;
margin: 0;
padding: 0;
vertical-align: baseline;
}
a:hover {
text-decoration: blink;
color: green;
}
a:visited {
color: gray;
}
ul, ol {
padding: 0;
margin: 0px 0px 0px 50px;
}
ul {
list-style-type: square;
list-style-position: inside;
}
li {
line-height:150%
}
li ul, li ul {
margin-left: 24px;
}
pre {
padding: 0px 10px;
max-width: 800px;
white-space: pre-wrap;
}
code {
font-family: Consolas, Monaco, Andale Mono, monospace, courrier new;
line-height: 1.5;
font-size: 15px;
background: #F8F8F8;
border-radius: 4px;
padding: 5px;
display: inline-block;
max-width: 800px;
white-space: pre-wrap;
}
li code, p code {
background: #CDCDCD;
color: #606AAA;
padding: 0px 5px 0px 5px;
}
code.r, code.cpp {
display: block;
word-wrap: break-word;
border: 1px solid #606AAA;
}
aside {
display: block;
float: right;
width: 390px;
}
blockquote {
border-left:.5em solid #606AAA;
background: #F8F8F8;
padding: 0em 1em 0em 1em;
margin-left:10px;
max-width: 500px;
}
blockquote cite {
line-height:10px;
color:#bfbfbf;
}
blockquote cite:before {
/content: '\2014 \00A0';
}
blockquote p, blockquote li {
color: #666;
}
hr {
/ width: 540px;
text-align: left;
margin: 0 auto 0 0;
color: #999;
}
/* table */
table {
width: 100%;
border-top: 1px solid #919699;
border-left: 1px solid #919699;
border-spacing: 0;
}
table th {
padding: 4px 8px 4px 8px;
text-align: center;
color: white;
background: #606AAA;
border-bottom: 1px solid #919699;
border-right: 1px solid #919699;
}
table th p {
font-weight: bold;
margin-bottom: 0px;
}
table td {
padding: 8px;
vertical-align: top;
border-bottom: 1px solid #919699;
border-right: 1px solid #919699;
}
table td:last-child {
/background: lightgray;
text-align: right;
}
table td p {
margin-bottom: 0px;
}
table td p + p {
margin-top: 5px;
}
table td p + p + p {
margin-top: 5px;
}

View File

@@ -0,0 +1,221 @@
\documentclass{article}
\RequirePackage{url}
\usepackage{hyperref}
\RequirePackage{amsmath}
\RequirePackage{natbib}
\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry}
\makeatletter
% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting}
%\VignetteKeywords{xgboost, gbm, gradient boosting machines}
%\VignettePackage{xgboost}
% \VignetteEngine{knitr::knitr}
\makeatother
\begin{document}
%\SweaveOpts{concordance=TRUE}
<<knitropts,echo=FALSE,message=FALSE>>=
if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE)
@
%
<<prelim,echo=FALSE>>=
xgboost.version = '0.3-0'
@
%
\begin{center}
\vspace*{6\baselineskip}
\rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt}
\rule{\textwidth}{0.4pt}\\[2\baselineskip]
{\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip]
\rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt}
\rule{\textwidth}{1.6pt}\\[2\baselineskip]
{\Large Tianqi Chen, Tong He}\\[\baselineskip]
{\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip]
{\large \today}\par
\vfill
\end{center}
\thispagestyle{empty}
\clearpage
\setcounter{page}{1}
\section{Introduction}
This is an introductory document of using the \verb@xgboost@ package in R.
\verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an efficient
and scalable implementation of gradient boosting framework by \citep{friedman2001greedy} \citep{friedman2000additive}.
The package includes efficient linear model solver and tree learning algorithm.
It supports various objective functions, including regression, classification
and ranking. The package is made to be extendible, so that users are also allowed to define their own objectives easily. It has several features:
\begin{enumerate}
\item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
Windows and Linux, with openmp. It is generally over 10 times faster than
\verb@gbm@.}
\item{Input Type: }{\verb@xgboost@ takes several types of input data:}
\begin{itemize}
\item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@}
\item{Sparse Matrix: }{R's sparse matrix \verb@Matrix::dgCMatrix@}
\item{Data File: }{Local data files}
\item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.}
\end{itemize}
\item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster
and linear booster, and is optimized for sparse input.}
\item{Customization: }{\verb@xgboost@ supports customized objective function
and evaluation function}
\item{Performance: }{\verb@xgboost@ has better performance on several different
datasets.}
\end{enumerate}
\section{Example with Mushroom data}
In this section, we will illustrate some common usage of \verb@xgboost@. The
Mushroom data is cited from UCI Machine Learning Repository. \citep{Bache+Lichman:2013}
<<Training and prediction with iris>>=
library(xgboost)
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1,
nround = 2, objective = "binary:logistic")
xgb.save(bst, 'model.save')
bst = xgb.load('model.save')
pred <- predict(bst, test$data)
@
\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
\verb@predict@ does prediction on the model.
Here we can save the model to a binary local file, and load it when needed.
We can't inspect the trees inside. However we have another function to save the
model in plain text.
<<Dump Model>>=
xgb.dump(bst, 'model.dump')
@
The output looks like
\begin{verbatim}
booster[0]:
0:[f28<1.00001] yes=1,no=2,missing=2
1:[f108<1.00001] yes=3,no=4,missing=4
3:leaf=1.85965
4:leaf=-1.94071
2:[f55<1.00001] yes=5,no=6,missing=6
5:leaf=-1.70044
6:leaf=1.71218
booster[1]:
0:[f59<1.00001] yes=1,no=2,missing=2
1:leaf=-6.23624
2:[f28<1.00001] yes=3,no=4,missing=4
3:leaf=-0.96853
4:leaf=0.784718
\end{verbatim}
It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
It speeds up \verb@xgboost@, and is needed for advanced features such as
training from initial prediction value, weighted training instance.
We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
<<xgb.DMatrix>>=
dtrain <- xgb.DMatrix(train$data, label = train$label)
class(dtrain)
head(getinfo(dtrain,'label'))
@
We can also save the matrix to a binary file. Then load it simply with
\verb@xgb.DMatrix@
<<save model>>=
xgb.DMatrix.save(dtrain, 'xgb.DMatrix')
dtrain = xgb.DMatrix('xgb.DMatrix')
@
\section{Advanced Examples}
The function \verb@xgboost@ is a simple function with fewer parameters, in order
to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires users to read the documentation a bit more carefully.
\verb@xgb.train@ only accepts an \verb@xgb.DMatrix@ object as its input, while it supports advanced features such as custom objective and evaluation functions.
<<Customized loss function>>=
logregobj <- function(preds, dtrain) {
  # Custom objective: apply the logistic transform to the raw predictions
  # and return the gradient and hessian terms against the labels in dtrain.
  y <- getinfo(dtrain, "label")
  prob <- 1 / (1 + exp(-preds))
  list(grad = prob - y, hess = prob * (1 - prob))
}
evalerror <- function(preds, dtrain) {
  # Custom evaluation function: root mean squared error between the
  # predictions and the labels stored in dtrain.
  # Returns a list holding the metric name and its value.
  labels <- getinfo(dtrain, "label")
  err <- sqrt(mean((preds-labels)^2))
  # sqrt(mean(squared error)) is the RMSE, so report it under that name
  return(list(metric = "RMSE", value = err))
}
dtest <- xgb.DMatrix(test$data, label = test$label)
watchlist <- list(eval = dtest, train = dtrain)
param <- list(max.depth = 2, eta = 1, silent = 1)
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
@
The gradient and second order gradient are required for the output of customized
objective function.
We also have \verb@slice@ for row extraction. It is useful in
cross-validation.
For a walkthrough demo, please see \verb@R-package/demo/@ for further
details.
\section{The Higgs Boson competition}
We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs
Boson Machine Learning Challenge}.
Here are the instructions to make a submission
\begin{enumerate}
\item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets}
and extract them to \verb@data/@.
\item Run scripts under \verb@xgboost/demo/kaggle-higgs/@:
\href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{higgs-train.R}
and \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-pred.R}{higgs-pred.R}.
The computation will take less than a minute on Intel i7.
\item Go to the \href{http://www.kaggle.com/c/higgs-boson/submissions/attach}{submission page}
and submit your result.
\end{enumerate}
We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script}
to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@.
The training set contains 350000 records and 30 features.
\verb@xgboost@ can automatically do parallel computation. On a machine with Intel
i7-4700MQ and 24GB memories, we found that \verb@xgboost@ costs about 35 seconds, which is about 20 times faster
than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was
still about two times faster than \verb@gbm@.
Meanwhile, the result from \verb@xgboost@ reaches
\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a
single model. This result stands in the
\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
competition.
\bibliographystyle{jss}
\nocite{*} % list uncited references
\bibliography{xgboost}
\end{document}
<<Temp file cleaning, include=FALSE>>=
file.remove("xgb.DMatrix")
file.remove("model.dump")
file.remove("model.save")
@

View File

@@ -0,0 +1,30 @@
@article{friedman2001greedy,
title={Greedy function approximation: a gradient boosting machine},
author={Friedman, Jerome H},
journal={Annals of Statistics},
pages={1189--1232},
year={2001},
publisher={JSTOR}
}
@article{friedman2000additive,
title={Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors)},
author={Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert and others},
journal={The annals of statistics},
volume={28},
number={2},
pages={337--407},
year={2000},
publisher={Institute of Mathematical Statistics}
}
@misc{
Bache+Lichman:2013 ,
author = "K. Bache and M. Lichman",
year = "2013",
title = "{UCI} Machine Learning Repository",
url = "http://archive.ics.uci.edu/ml",
institution = "University of California, Irvine, School of Information and Computer Sciences"
}

View File

@@ -0,0 +1,405 @@
---
title: "Xgboost presentation"
output:
rmarkdown::html_vignette:
css: vignette.css
number_sections: yes
toc: yes
bibliography: xgboost.bib
author: Tianqi Chen, Tong He, Michaël Benesty
vignette: >
%\VignetteIndexEntry{Xgboost presentation}
%\VignetteEngine{knitr::rmarkdown}
\usepackage[utf8]{inputenc}
---
Introduction
============
**Xgboost** is short for e**X**treme **G**radient **Boost**ing package.
The purpose of this Vignette is to show you how to use **Xgboost** to build a model and make predictions.
It is an efficient and scalable implementation of gradient boosting framework by @friedman2000additive and @friedman2001greedy. Two solvers are included:
- *linear* model ;
- *tree learning* algorithm.
It supports various objective functions, including *regression*, *classification* and *ranking*. The package is made to be extendible, so that users are also allowed to define their own objective functions easily.
It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions.
It has several features:
* Speed: it can automatically do parallel computation on *Windows* and *Linux*, with *OpenMP*. It is generally over 10 times faster than the classical `gbm`.
* Input Type: it takes several types of input data:
* *Dense* Matrix: *R*'s *dense* matrix, i.e. `matrix` ;
* *Sparse* Matrix: *R*'s *sparse* matrix, i.e. `Matrix::dgCMatrix` ;
* Data File: local data files ;
* `xgb.DMatrix`: its own class (recommended).
* Sparsity: it accepts *sparse* input for both *tree booster* and *linear booster*, and is optimized for *sparse* input ;
* Customization: it supports customized objective functions and evaluation functions.
Installation
============
Github version
--------------
For up-to-date version (highly recommended), install from *Github*:
```{r installGithub, eval=FALSE}
devtools::install_github('dmlc/xgboost', subdir='R-package')
```
> *Windows* user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
Cran version
------------
For stable version on *CRAN*, run:
```{r installCran, eval=FALSE}
install.packages('xgboost')
```
Learning
========
For the purpose of this tutorial we will load **Xgboost** package.
```{r libLoading, results='hold', message=F, warning=F}
require(xgboost)
```
Dataset presentation
--------------------
In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the same as you will use in your everyday life :-).
Mushroom data is cited from UCI Machine Learning Repository. @Bache+Lichman:2013.
Dataset loading
---------------
We will load the `agaricus` datasets embedded with the package and will link them to variables.
The datasets are already split in:
* `train`: will be used to build the model ;
* `test`: will be used to assess the quality of our model.
Why *split* the dataset in two parts?
In the first part we will build our model. In the second part we will want to test it and assess its quality. Without dividing the dataset we would test the model on the data which the algorithm has already seen.
```{r datasetLoading, results='hold', message=F, warning=F}
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
```
> In the real world, it would be up to you to make this division between `train` and `test` data. The way to do it is out of the purpose of this article, however `caret` package may [help](http://topepo.github.io/caret/splitting.html).
Each variable is a `list` containing two things, `label` and `data`:
```{r dataList, message=F, warning=F}
str(train)
```
`label` is the outcome of our dataset meaning it is the binary *classification* we will try to predict.
Let's discover the dimensionality of our datasets.
```{r dataSize, message=F, warning=F}
dim(train$data)
dim(test$data)
```
This dataset is kept very small so that the **R** package does not become too heavy; however, **Xgboost** is built to manage huge datasets very efficiently.
As seen below, the `data` are stored in a `dgCMatrix` which is a *sparse* matrix and `label` vector is a `numeric` vector (`{0,1}`):
```{r dataClass, message=F, warning=F}
class(train$data)[1]
class(train$label)
```
Basic Training using Xgboost
----------------------------
This step is the most critical part of the process for the quality of our model.
### Basic training
We are using the `train` data. As explained above, both `data` and `label` are stored in a `list`.
In a *sparse* matrix, cells containing `0` are not stored in memory. Therefore, in a dataset mainly made of `0`, memory size is reduced. Such datasets are very common.
We will train decision tree model using the following parameters:
* `objective = "binary:logistic"`: we will train a binary classification model ;
* `max.depth = 2`: the trees won't be deep, because our case is very simple ;
* `nthread = 2`: the number of cpu threads we are going to use;
* `nround = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction.
```{r trainingSparse, message=F, warning=F}
bstSparse <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
```
> The more complex the relationship between your features and your `label` is, the more passes you need.
### Parameter variations
#### Dense matrix
Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R** matrix.
```{r trainingDense, message=F, warning=F}
bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
```
#### xgb.DMatrix
**Xgboost** offers a way to group the data and the label in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later.
```{r trainingDmatrix, message=F, warning=F}
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
```
#### Verbose option
**Xgboost** has several features to help you to view how the learning progresses internally. The purpose is to help you to set the best parameters, which is the key to your model quality.
One of the simplest ways to see the training progress is to set the `verbose` option (see below for more advanced techniques).
```{r trainingVerbose0, message=T, warning=F}
# verbose = 0, no message
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 0)
```
```{r trainingVerbose1, message=T, warning=F}
# verbose = 1, print evaluation metric
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 1)
```
```{r trainingVerbose2, message=T, warning=F}
# verbose = 2, also print information about tree
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 2)
```
Basic prediction using Xgboost
==============================
Perform the prediction
----------------------
The purpose of the model we have built is to classify new data. As explained before, we will use the `test` dataset for this step.
```{r predicting, message=F, warning=F}
pred <- predict(bst, test$data)
# size of the prediction vector
print(length(pred))
# limit display of predictions to the first 6 (the default of head())
print(head(pred))
```
These numbers don't look like *binary classification* `{0,1}`. We need to perform a simple transformation before being able to use these results.
Transform the regression in a binary classification
---------------------------------------------------
The only thing that **Xgboost** does is a *regression*. **Xgboost** is using `label` vector to build its *regression* model.
How can we use a *regression* model to perform a binary classification?
If we think about the meaning of a regression applied to our data, the numbers we get are probabilities that a datum will be classified as `1`. Therefore, we will set the rule that if this probability for a specific datum is `> 0.5` then the observation is classified as `1` (or `0` otherwise).
```{r predictingTest, message=F, warning=F}
prediction <- as.numeric(pred > 0.5)
print(head(prediction))
```
Measuring model performance
---------------------------
To measure the model performance, we will compute a simple metric, the *average error*.
```{r predictingAverageError, message=F, warning=F}
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))
```
> Note that the algorithm has not seen the `test` data during the model construction.
Steps explanation:
1. `as.numeric(pred > 0.5)` applies our rule that when the probability (<=> regression <=> prediction) is `> 0.5` the observation is classified as `1` and `0` otherwise ;
2. `probabilityVectorPreviouslyComputed != test$label` computes the vector of error between true data and computed probabilities ;
3. `mean(vectorOfErrors)` computes the *average error* itself.
The most important thing to remember is that **to do a classification, you just do a regression to the** `label` **and then apply a threshold**.
*Multiclass* classification works in a similar way.
This metric is **`r round(err, 2)`** and is pretty low: our yummy mushroom model works well!
Advanced features
=================
Most of the features below have been implemented to help you to improve your model by offering a better understanding of its content.
Dataset preparation
-------------------
For the following advanced features, we need to put data in `xgb.DMatrix` as explained above.
```{r DMatrix, message=F, warning=F}
dtrain <- xgb.DMatrix(data = train$data, label=train$label)
dtest <- xgb.DMatrix(data = test$data, label=test$label)
```
Measure learning progress with xgb.train
----------------------------------------
Both `xgboost` (simple) and `xgb.train` (advanced) functions train models.
One of the special features of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a time when having too many rounds leads to overfitting. You can see this feature as a cousin of the cross-validation method. The following techniques will help you to avoid overfitting or to optimize the learning time by stopping it as soon as possible.
One way to measure progress in learning of a model is to provide to **Xgboost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
> In some way it is similar to what we have done above with the average error. The main difference is that above it was measured after building the model, and now it is during the construction that we measure errors.
For the purpose of this example, we use `watchlist` parameter. It is a list of `xgb.DMatrix`, each of them tagged with a name.
```{r watchlist, message=F, warning=F}
watchlist <- list(train=dtrain, test=dtest)
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
```
**Xgboost** has computed at each round the same average error metric as seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
If you do not get such results with your own dataset, you should think about how you divided your dataset into training and test sets. Maybe there is something to fix. Again, `caret` package may [help](http://topepo.github.io/caret/splitting.html).
For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
```{r watchlist2, message=F, warning=F}
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
```
> `eval.metric` allows us to monitor two new metrics for each round, `logloss` and `error`.
Linear boosting
---------------
Until now, all the learnings we have performed were based on boosting trees. **Xgboost** implements a second algorithm, based on linear boosting. The only difference with previous command is `booster = "gblinear"` parameter (and removing `eta` parameter).
```{r linearBoosting, message=F, warning=F}
bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
```
In this specific case, *linear boosting* gets slightly better performance metrics than the decision-tree-based algorithm.
In simple cases, this will happen because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better at catching a non-linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use.
Manipulating xgb.DMatrix
------------------------
### Save / Load
Like saving models, `xgb.DMatrix` object (which groups both dataset and outcome) can also be saved using `xgb.DMatrix.save` function.
```{r DMatrixSave, message=F, warning=F}
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# to load it in, simply call xgb.DMatrix
dtrain2 <- xgb.DMatrix("dtrain.buffer")
bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
```
```{r DMatrixDel, include=FALSE}
file.remove("dtrain.buffer")
```
### Information extraction
Information can be extracted from `xgb.DMatrix` using `getinfo` function. Hereafter we will extract `label` data.
```{r getinfo, message=F, warning=F}
label = getinfo(dtest, "label")
pred <- predict(bst, dtest)
err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
print(paste("test-error=", err))
```
View the trees from a model
---------------------------
You can dump the tree you learned using `xgb.dump` into a text file.
```{r dump, message=T, warning=F}
xgb.dump(bst, with.stats = T)
```
> if you provide a path to `fname` parameter you can save the trees to your hard drive.
Save and load models
--------------------
Maybe your dataset is big, and it takes time to train a model on it? Maybe you are not a big fan of losing time in redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.
Fortunately for you, **Xgboost** implements such functions.
```{r saveModel, message=F, warning=F}
# save model to binary local file
xgb.save(bst, "xgboost.model")
```
> `xgb.save` function should return `r TRUE` if everything goes well and crashes otherwise.
An interesting test to see how identical our saved model is to the original one would be to compare the two predictions.
```{r loadModel, message=F, warning=F}
# load binary model to R
bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, test$data)
# And now the test
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
```
```{r clean, include=FALSE}
# delete the created model
file.remove("./xgboost.model")
```
> result is `0`? We are good!
In some very specific cases, like when you want to pilot **Xgboost** from `caret` package, you will want to save the model as a *R* binary vector. See below how to do it.
```{r saveLoadRBinVectorModel, message=F, warning=F}
# save model to R's raw vector
rawVec <- xgb.save.raw(bst)
# print class
print(class(rawVec))
# load binary model to R
bst3 <- xgb.load(rawVec)
pred3 <- predict(bst3, test$data)
# pred3 should be identical to pred
# (compare pred3, not pred2, so the raw-vector round trip is what gets checked)
print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred))))
```
> Again `0`? It seems that `Xgboost` works pretty well!
References
==========

View File

@@ -1,4 +1,57 @@
xgboost
=======
XGBoost: eXtreme Gradient Boosting
==================================
General Purpose Gradient Boosting Library
An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
It implements machine learning algorithms under the gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also be distributed and scales to terascale data.
Contributors: https://github.com/dmlc/xgboost/graphs/contributors
Documentations: [Documentation of xgboost](doc/README.md)
Issues Tracker: [https://github.com/dmlc/xgboost/issues](https://github.com/dmlc/xgboost/issues?q=is%3Aissue+label%3Aquestion)
Please join [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/) to ask questions and share your experience on xgboost.
- Use issue tracker for bug reports, feature requests etc.
- Use the user group to post your experience, ask questions about general usages.
Gitter for developers [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
Distributed Version: [Distributed XGBoost](multi-node)
Highlights of Usecases: [Highlight Links](doc/README.md#highlight-links)
What's New
==========
* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
* XGBoost wins [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
- Checkout the winning solution at [Highlight links](doc/README.md#highlight-links)
* [External Memory Version](doc/external_memory.md)
Features
========
* Easily accessible in python, R, Julia, CLI
* Fast speed and memory efficient
- Can be more than 10 times faster than GBM in sklearn and R
- Handles sparse matrices, support external memory
* Accurate prediction, and used extensively by data scientists and kagglers
- See [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
* Distributed and Portable
- The distributed version runs on Hadoop (YARN), MPI, SGE etc.
- Scales to billions of examples and beyond
Build
=======
* Run ```bash build.sh``` (you can also type make)
- Normally it gives what you want
- See [Build Instruction](doc/build.md) for more information
Version
=======
* Current version xgboost-0.4, a lot of improvements have been made since 0.3
- Change log in [CHANGES.md](CHANGES.md)
- This version is compatible with 0.3x versions
XGBoost in Graphlab Create
==========================
* XGBoost is adopted as part of the boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of terabyte-scale data in one framework. Try Graphlab Create at http://graphlab.com/products/create/quick-start-guide.html
* Nice blogpost by Jay Gu using GLC boosted tree to solve kaggle bike sharing challenge: http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand

21
build.sh Executable file
View File

@@ -0,0 +1,21 @@
#!/bin/bash
# Simple script to build xgboost on Mac and Linux.
# It first tries a regular `make` (with OpenMP). If that fails, it cleans the
# tree and rebuilds with OpenMP disabled (`make no_omp=1`).
# This automatically builds xgboost for Mac users who don't have OpenMP support.
# In most cases, typing `make` will give what you want.
# See additional instructions in doc/build.md
#
# Exit status: 0 when either build succeeds, 1 when both builds fail.
if make; then
    echo "Successfully build multi-thread xgboost"
else
    echo "-----------------------------"
    echo "Building multi-thread xgboost failed"
    echo "Start to build single-thread xgboost"
    make clean
    # Only report success when the fallback build really succeeded; otherwise
    # surface the failure instead of printing a misleading success message.
    if make no_omp=1; then
        echo "Successfully build single-thread xgboost"
        echo "If you want multi-threaded version"
        echo "See additional instructions in doc/build.md"
    else
        echo "Building single-thread xgboost failed" >&2
        exit 1
    fi
fi

1
demo/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
*.libsvm

52
demo/README.md Normal file
View File

@@ -0,0 +1,52 @@
XGBoost Examples
====
This folder contains all the code examples using xgboost.
* Contribution of examples, benchmarks is more than welcome!
* If you would like to share how you use xgboost to solve your problem, send a pull request :)
Features Walkthrough
====
This is a list of short codes introducing different functionalities of xgboost and its wrapper.
* Basic walkthrough of wrappers
[python](guide-python/basic_walkthrough.py)
[R](../R-package/demo/basic_walkthrough.R)
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
* Customize loss function, and evaluation metric
[python](guide-python/custom_objective.py)
[R](../R-package/demo/custom_objective.R)
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/custom_objective.jl)
* Boosting from existing prediction
[python](guide-python/boost_from_prediction.py)
[R](../R-package/demo/boost_from_prediction.R)
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl)
* Predicting using first n trees
[python](guide-python/predict_first_ntree.py)
[R](../R-package/demo/predict_first_ntree.R)
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl)
* Generalized Linear Model
[python](guide-python/generalized_linear_model.py)
[R](../R-package/demo/generalized_linear_model.R)
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/generalized_linear_model.jl)
* Cross validation
[python](guide-python/cross_validation.py)
[R](../R-package/demo/cross_validation.R)
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl)
* Predicting leaf indices
[python](guide-python/predict_leaf_indices.py)
[R](../R-package/demo/predict_leaf_indices.R)
Basic Examples by Tasks
====
Most of examples in this section are based on CLI or python version.
However, the parameter settings can be applied to all versions
* [Binary classification](binary_classification)
* [Multiclass classification](multiclass_classification)
* [Regression](regression)
* [Learning to Rank](rank)
Benchmarks
====
* [Starter script for Kaggle Higgs Boson](kaggle-higgs)
* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution)

View File

@@ -0,0 +1,172 @@
Binary Classification
====
This is the quick start tutorial for xgboost CLI version. You can also checkout [../../doc/README.md](../../doc/README.md) for links to tutorial in python or R.
Here we demonstrate how to use XGBoost for a binary classification task. Before getting started, make sure you compile xgboost in the root directory of the project by typing ```make```
The script runexp.sh can be used to run the demo. Here we use [mushroom dataset](https://archive.ics.uci.edu/ml/datasets/Mushroom) from UCI machine learning repository.
### Tutorial
#### Generate Input Data
XGBoost takes LibSVM format. An example of faked input data is below:
```
1 101:1.2 102:0.03
0 1:2.1 10001:300 10002:400
...
```
Each line represents a single instance, and in the first line '1' is the instance label, '101' and '102' are feature indices, '1.2' and '0.03' are feature values. In the binary classification case, '1' is used to indicate positive samples, and '0' is used to indicate negative samples. We also support probability values in [0,1] as label, to indicate the probability of the instance being positive.
First we will transform the dataset into classic LibSVM format and split the data into training set and test set by running:
```
python mapfeat.py
python mknfold.py agaricus.txt 1
```
The two files, 'agaricus.txt.train' and 'agaricus.txt.test' will be used as training set and test set.
#### Training
Then we can run the training process:
```
../../xgboost mushroom.conf
```
mushroom.conf is the configuration for both training and testing. Each line contains an [attribute]=[value] configuration:
```conf
# General Parameters, see comment for each definition
# can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task Parameters
# the number of round to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
# The path of training data
data = "agaricus.txt.train"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "agaricus.txt.test"
# The path of test data
test:data = "agaricus.txt.test"
```
We use the tree booster and logistic regression objective in our setting. This indicates that we accomplish our task using classic gradient boosting regression tree(GBRT), which is a promising method for binary classification.
The parameters shown in the example are the most common ones needed to use xgboost.
If you are interested in more parameter settings, the complete parameter settings and detailed descriptions are [here](../../doc/parameter.md). Besides putting the parameters in the configuration file, we can set them by passing them as arguments as below:
```
../../xgboost mushroom.conf max_depth=6
```
This means that the parameter max_depth will be set as 6 rather than 3 in the conf file. When you use command line, make sure max_depth=6 is passed in as single argument, i.e. do not contain space in the argument. When a parameter setting is provided in both command line input and the config file, the command line setting will override the setting in config file.
In this example, we use tree booster for gradient boosting. If you would like to use linear booster for regression, you can keep all the parameters except booster and the tree booster parameters as below:
```conf
# General Parameters
# choose the linear booster
booster = gblinear
...
# Change Tree Booster Parameters into Linear Booster Parameters
# L2 regularization term on weights, default 0
lambda = 0.01
# L1 regularization term on weights, default 0
alpha = 0.01
# L2 regularization term on bias, default 0
lambda_bias = 0.01
# Regression Parameters
...
```
#### Get Predictions
After training, we can use the output model to get the prediction of the test data:
```
../../xgboost mushroom.conf task=pred model_in=0003.model
```
For binary classification, the output predictions are probability confidence scores in [0,1], corresponding to the probability of the label being positive.
#### Dump Model
This is a preliminary feature, so far only tree models support text dump. XGBoost can display the tree models in text files and we can scan the model in an easy way:
```
../../xgboost mushroom.conf task=dump model_in=0003.model name_dump=dump.raw.txt
../../xgboost mushroom.conf task=dump model_in=0003.model fmap=featmap.txt name_dump=dump.nice.txt
```
In this demo, the tree boosters obtained will be printed in dump.raw.txt and dump.nice.txt, and the latter one is easier to understand because of usage of feature mapping featmap.txt
Format of ```featmap.txt: <featureid> <featurename> <q or i or int>\n ```:
- Feature id must be from 0 to number of features, in sorted order.
- i means this feature is binary indicator feature
- q means this feature is a quantitative value, such as age, time, can be missing
- int means this feature is integer value (when int is hinted, the decision boundary will be integer)
#### Monitoring Progress
When you run training we can find there are messages displayed on screen
```
tree train end, 1 roots, 12 extra nodes, 0 pruned nodes ,max_depth=3
[0] test-error:0.016139
boosting round 1, 0 sec elapsed
tree train end, 1 roots, 10 extra nodes, 0 pruned nodes ,max_depth=3
[1] test-error:0.000000
```
The messages for evaluation are printed into stderr, so if you want only to log the evaluation progress, simply type
```
../../xgboost mushroom.conf 2>log.txt
```
Then you can find the following content in log.txt
```
[0] test-error:0.016139
[1] test-error:0.000000
```
We can also monitor both training and test statistics, by adding following lines to configure
```conf
eval[test] = "agaricus.txt.test"
eval[trainname] = "agaricus.txt.train"
```
Run the command again, we can find the log file becomes
```
[0] test-error:0.016139 trainname-error:0.014433
[1] test-error:0.000000 trainname-error:0.001228
```
The rule is eval[name-printed-in-log] = filename, then the file will be added to monitoring process, and evaluated each round.
xgboost also supports monitoring multiple metrics, suppose we also want to monitor average log-likelihood of each prediction during training, simply add ```eval_metric=logloss``` to configure. Run again, we can find the log file becomes
```
[0] test-error:0.016139 test-negllik:0.029795 trainname-error:0.014433 trainname-negllik:0.027023
[1] test-error:0.000000 test-negllik:0.000000 trainname-error:0.001228 trainname-negllik:0.002457
```
### Saving Progress Models
If you want to save model every two round, simply set save_period=2. You will find 0002.model in the current folder. If you want to change the output folder of models, add model_dir=foldername. By default xgboost saves the model of last round.
#### Continue from Existing Model
If you want to continue boosting from existing model, say 0002.model, use
```
../../xgboost mushroom.conf model_in=0002.model num_round=2 model_out=continue.model
```
xgboost will load from 0002.model continue boosting for 2 rounds, and save output to continue.model. However, beware that the training and evaluation data specified in mushroom.conf should not change when you use this function.
#### Use Multi-Threading
When you are working with a large dataset, you may want to take advantage of parallelism. If your compiler supports OpenMP, xgboost is naturally multi-threaded, to set number of parallel running threads to 10, add ```nthread=10``` to your configuration.
#### Additional Notes
* What are ```agaricus.txt.test.buffer``` and ```agaricus.txt.train.buffer``` generated during runexp.sh?
- By default xgboost will automatically generate a binary format buffer of input data, with suffix ```buffer```. The next time you run xgboost, it detects if ```agaricus.txt.test.buffer``` exists, and automatically loads from binary buffer if possible, this can speed up the training process when you do training many times. You can disable it by setting ```use_buffer=0```.
- Buffer file can also be used as standalone input, i.e if buffer file exists, but original agaricus.txt.test was removed, xgboost will still run
* Deviation from LibSVM input format: xgboost is compatible with LibSVM format, with the following minor differences:
- xgboost allows feature index starts from 0
- for binary classification, the label is 1 for positive, 0 for negative, instead of +1,-1
- the feature indices in each line *do not* need to be sorted
Demonstrating how to use XGBoost to accomplish binary classification tasks on UCI mushroom dataset http://archive.ics.uci.edu/ml/datasets/Mushroom

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,32 @@
1. cap-shape: bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
4. bruises?: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,
musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g,
green=r,orange=o,pink=p,purple=u,red=e,
white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e,
rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,
none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,
orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n,
scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p,
urban=u,waste=w,woods=d

View File

@@ -0,0 +1,148 @@
1. Title: Mushroom Database
2. Sources:
(a) Mushroom records drawn from The Audubon Society Field Guide to North
American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred
A. Knopf
(b) Donor: Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
(c) Date: 27 April 1987
3. Past Usage:
1. Schlimmer,J.S. (1987). Concept Acquisition Through Representational
Adjustment (Technical Report 87-19). Doctoral dissertation, Department
of Information and Computer Science, University of California, Irvine.
--- STAGGER: asymptoted to 95% classification accuracy after reviewing
1000 instances.
2. Iba,W., Wogulis,J., & Langley,P. (1988). Trading off Simplicity
and Coverage in Incremental Concept Learning. In Proceedings of
the 5th International Conference on Machine Learning, 73-79.
Ann Arbor, Michigan: Morgan Kaufmann.
-- approximately the same results with their HILLARY algorithm
3. In the following references a set of rules (given below) were
learned for this data set which may serve as a point of
comparison for other researchers.
Duch W, Adamczak R, Grabczewski K (1996) Extraction of logical rules
from training data using backpropagation networks, in: Proc. of the
The 1st Online Workshop on Soft Computing, 19-30.Aug.1996, pp. 25-30,
available on-line at: http://www.bioele.nuee.nagoya-u.ac.jp/wsc1/
Duch W, Adamczak R, Grabczewski K, Ishikawa M, Ueda H, Extraction of
crisp logical rules using constrained backpropagation networks -
comparison of two new approaches, in: Proc. of the European Symposium
on Artificial Neural Networks (ESANN'97), Bruge, Belgium 16-18.4.1997,
pp. xx-xx
Wlodzislaw Duch, Department of Computer Methods, Nicholas Copernicus
University, 87-100 Torun, Grudziadzka 5, Poland
e-mail: duch@phys.uni.torun.pl
WWW http://www.phys.uni.torun.pl/kmk/
Date: Mon, 17 Feb 1997 13:47:40 +0100
From: Wlodzislaw Duch <duch@phys.uni.torun.pl>
Organization: Dept. of Computer Methods, UMK
I have attached a file containing logical rules for mushrooms.
It should be helpful for other people since only in the last year I
have seen about 10 papers analyzing this dataset and obtaining quite
complex rules. We will try to contribute other results later.
With best regards, Wlodek Duch
________________________________________________________________
Logical rules for the mushroom data sets.
Logical rules given below seem to be the simplest possible for the
mushroom dataset and therefore should be treated as benchmark results.
Disjunctive rules for poisonous mushrooms, from most general
to most specific:
P_1) odor=NOT(almond.OR.anise.OR.none)
120 poisonous cases missed, 98.52% accuracy
P_2) spore-print-color=green
48 cases missed, 99.41% accuracy
P_3) odor=none.AND.stalk-surface-below-ring=scaly.AND.
(stalk-color-above-ring=NOT.brown)
8 cases missed, 99.90% accuracy
P_4) habitat=leaves.AND.cap-color=white
100% accuracy
Rule P_4) may also be
P_4') population=clustered.AND.cap_color=white
These rules involve 6 attributes (out of 22). Rules for edible
mushrooms are obtained as negation of the rules given above, for
example the rule:
odor=(almond.OR.anise.OR.none).AND.spore-print-color=NOT.green
gives 48 errors, or 99.41% accuracy on the whole dataset.
Several slightly more complex variations on these rules exist,
involving other attributes, such as gill_size, gill_spacing,
stalk_surface_above_ring, but the rules given above are the simplest
we have found.
4. Relevant Information:
This data set includes descriptions of hypothetical samples
corresponding to 23 species of gilled mushrooms in the Agaricus and
Lepiota Family (pp. 500-525). Each species is identified as
definitely edible, definitely poisonous, or of unknown edibility and
not recommended. This latter class was combined with the poisonous
one. The Guide clearly states that there is no simple rule for
determining the edibility of a mushroom; no rule like ``leaflets
three, let it be'' for Poisonous Oak and Ivy.
5. Number of Instances: 8124
6. Number of Attributes: 22 (all nominally valued)
7. Attribute Information: (classes: edible=e, poisonous=p)
1. cap-shape: bell=b,conical=c,convex=x,flat=f,
knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,
pink=p,purple=u,red=e,white=w,yellow=y
4. bruises?: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,
musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g,
green=r,orange=o,pink=p,purple=u,red=e,
white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e,
rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,
none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,
orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n,
scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p,
urban=u,waste=w,woods=d
8. Missing Attribute Values: 2480 of them (denoted by "?"), all for
attribute #11.
9. Class Distribution:
-- edible: 4208 (51.8%)
-- poisonous: 3916 (48.2%)
-- total: 8124 instances

View File

@@ -0,0 +1,49 @@
#!/usr/bin/python
def loadfmap( fname ):
    """Parse an attribute description file into feature-index maps.

    Each attribute starts on a line of the form
    ``<idx>. <name>: k1=v1,k2=v2,...``; lines without a ``.`` in their
    first token are treated as continuations of the previous attribute's
    ``key=value`` list.

    Returns a pair ``(fmap, nmap)``:
      * ``fmap[idx][v]`` is the feature id assigned to value code ``v``
        of attribute number ``idx``;
      * ``nmap[feature_id]`` is the readable name ``'<name>=<k>'``.

    Raises AssertionError if an attribute index appears twice.
    """
    fmap = {}
    nmap = {}
    # use a context manager so the file is closed even if parsing fails
    with open( fname ) as fin:
        for line in fin:
            arr = line.split()
            if not arr:
                # blank line: nothing to parse (arr[0] would raise IndexError)
                continue
            if '.' in arr[0]:
                # header line, starts a new attribute
                idx = int( arr[0].strip('.') )
                assert idx not in fmap
                fmap[ idx ] = {}
                ftype = arr[1].strip(':')
                content = arr[2]
            else:
                # continuation line, belongs to the current attribute
                content = arr[0]
            for it in content.split(','):
                if it.strip() == '':
                    # a trailing comma leaves an empty item
                    continue
                k, v = it.split('=')
                # feature ids are handed out in order of appearance
                fmap[ idx ][ v ] = len(nmap)
                nmap[ len(nmap) ] = ftype + '=' + k
    return fmap, nmap
def write_nmap( fo, nmap ):
    """Write the feature-name map to ``fo``, one feature per line.

    ``nmap`` is indexed by consecutive integers starting at 0; each line
    has the form ``<id>\\t<name>\\ti``.
    """
    total = len(nmap)
    for feature_id in range( total ):
        record = '%d\t%s\ti\n' % (feature_id, nmap[feature_id])
        fo.write( record )
# start here
# build the value -> feature id mapping and save the readable feature names
fmap, nmap = loadfmap( 'agaricus-lepiota.fmap' )
with open( 'featmap.txt', 'w' ) as fo:
    write_nmap( fo, nmap )

# convert the comma-separated records into "<label> <feature id>:1 ..." lines;
# the context manager closes both files even if a record fails to convert
with open( 'agaricus.txt', 'w' ) as fo, open( 'agaricus-lepiota.data' ) as fi:
    for l in fi:
        arr = l.split(',')
        # first column is the class: poisonous -> 1, edible -> 0
        if arr[0] == 'p':
            fo.write('1')
        else:
            assert arr[0] == 'e'
            fo.write('0')
        # every remaining column is one attribute, emitted as an indicator feature
        for i in range( 1, len(arr) ):
            fo.write( ' %d:1' % fmap[i][arr[i].strip()] )
        fo.write('\n')

View File

@@ -0,0 +1,29 @@
#!/usr/bin/python
import sys
import random
# both <filename> and <k> are required, so at least 3 argv entries are needed;
# checking for fewer than 2 let a single argument through and crashed on argv[2]
if len(sys.argv) < 3:
    print ('Usage:<filename> <k> [nfold = 5]')
    sys.exit(0)

# fixed seed so the split is reproducible
random.seed( 10 )

k = int( sys.argv[2] )
if len(sys.argv) > 3:
    nfold = int( sys.argv[3] )
else:
    nfold = 5

# each line is assigned a random fold in [1, nfold]; fold k goes to the
# .test file, everything else to the .train file
with open( sys.argv[1], 'r' ) as fi, \
        open( sys.argv[1]+'.train', 'w' ) as ftr, \
        open( sys.argv[1]+'.test', 'w' ) as fte:
    for l in fi:
        if random.randint( 1 , nfold ) == k:
            fte.write( l )
        else:
            ftr.write( l )

View File

@@ -0,0 +1,29 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task Parameters
# the number of boosting rounds
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
# The path of training data
data = "agaricus.txt.train"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "agaricus.txt.test"
# evaluate on training data as well each round
eval_train = 1
# The path of test data
test:data = "agaricus.txt.test"

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# map features using indicator encoding; this also produces featmap.txt
python mapfeat.py
# split agaricus.txt into agaricus.txt.train and agaricus.txt.test (fold 1 is the test set)
python mknfold.py agaricus.txt 1
# train with the settings in mushroom.conf and output the model
../../xgboost mushroom.conf
# output predictions (task=pred) using the trained model
../../xgboost mushroom.conf task=pred model_in=0002.model
# print the boosters of 0002.model in dump.raw.txt
../../xgboost mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt
# dump again, using the feature map for a more readable output
../../xgboost mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
cat dump.nice.txt

2
demo/data/README.md Normal file
View File

@@ -0,0 +1,2 @@
This folder contains processed example dataset used by the demos.
Copyright of the dataset belongs to the original copyright holder

1611
demo/data/agaricus.txt.test Normal file

File diff suppressed because it is too large Load Diff

6513
demo/data/agaricus.txt.train Normal file

File diff suppressed because it is too large Load Diff

126
demo/data/featmap.txt Normal file
View File

@@ -0,0 +1,126 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i

View File

@@ -0,0 +1,11 @@
XGBoost Python Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.py)
* [Customize loss function, and evaluation metric](custom_objective.py)
* [Boosting from existing prediction](boost_from_prediction.py)
* [Predicting using first n trees](predict_first_ntree.py)
* [Generalized Linear Model](generalized_linear_model.py)
* [Cross validation](cross_validation.py)
* [Predicting leaf indices](predict_leaf_indices.py)
* [Sklearn Wrapper](sklearn_example.py)
* [External Memory](external_memory.py)

View File

@@ -0,0 +1,72 @@
#!/usr/bin/python
import numpy as np
import scipy.sparse
import xgboost as xgb
### simple example
# DMatrix can be loaded from a text file, or from a binary buffer generated by xgboost
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')

# parameters are given as a dict; the definitions are the same as in the c++ version
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}

# validation sets whose metrics are reported while training
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)

# predict on the test set and report the error at a 0.5 cutoff
preds = bst.predict(dtest)
labels = dtest.get_label()
n_wrong = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
print ('error=%f' % (n_wrong / float(len(preds))))
bst.save_model('0001.model')

# dump the model as text, first raw ...
bst.dump_model('dump.raw.txt')
# ... then with the feature map
bst.dump_model('dump.nice.txt', '../data/featmap.txt')

# write the test DMatrix to a binary buffer, and the model to disk
dtest.save_binary('dtest.buffer')
bst.save_model('xgb.model')

# read both back in and predict again
bst2 = xgb.Booster(model_file='xgb.model')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2)

# the reloaded model and data must reproduce the predictions exactly
assert np.sum(np.abs(preds2 - preds)) == 0
###
# build dmatrix from scipy.sparse
print ('start running example of build DMatrix from scipy.sparse CSR Matrix')
labels = []
row = []; col = []; dat = []
# parse the text file by hand: each line is "<label> <col>:<value> ...";
# the context manager makes sure the file is closed after parsing
with open('../data/agaricus.txt.train') as fin:
    for i, l in enumerate(fin):
        arr = l.split()
        labels.append(int(arr[0]))
        for it in arr[1:]:
            k, v = it.split(':')
            # line number i is the row index of every entry on that line
            row.append(i); col.append(int(k)); dat.append(float(v))
csr = scipy.sparse.csr_matrix((dat, (row, col)))
dtrain = xgb.DMatrix(csr, label=labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)

print ('start running example of build DMatrix from scipy.sparse CSC Matrix')
# we can also construct from csc matrix
csc = scipy.sparse.csc_matrix((dat, (row, col)))
dtrain = xgb.DMatrix(csc, label=labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)

print ('start running example of build DMatrix from numpy array')
# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation
# then convert to DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix(npymat, label=labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)

View File

@@ -0,0 +1,24 @@
#!/usr/bin/python
import numpy as np
import xgboost as xgb
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

###
# advanced: start from an initial base prediction
#
print ('start running example to start from a initial prediction')

# parameters are given as a dict; the definitions are the same as in the c++ version
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}

# first stage: train xgboost for 1 round
bst = xgb.train(param, dtrain, 1, watchlist)

# set_base_margin needs margin values, not transformed predictions;
# output_margin=True returns the values before the logistic transformation
ptrain = bst.predict(dtrain, output_margin=True)
ptest = bst.predict(dtest, output_margin=True)
dtrain.set_base_margin(ptrain)
dtest.set_base_margin(ptest)

# second stage: boost one more round on top of the base margins
print ('this is result of running from initial prediction')
bst = xgb.train(param, dtrain, 1, watchlist)

View File

@@ -0,0 +1,61 @@
#!/usr/bin/python
import numpy as np
import xgboost as xgb
### load the data and set up training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}
num_round = 2

print ('running cross validation')
# 5-fold cross validation; results are printed as
# [iteration] metric_name:mean_value+std_value
# where std_value is the standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=0)

print ('running cross validation, disable standard deviation display')
# same cross validation, with show_stdv=False to disable the
# standard deviation display
xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=0,
       show_stdv=False)

print ('running cross validation, with preprocessing function')
# preprocessing function for cross validation:
# it returns the preprocessed training data, test data and parameters,
# and can be used for things like weight rescaling.
# as an example, it sets scale_pos_weight here
def fpreproc(dtrain, dtest, param):
    """Set ``param['scale_pos_weight']`` to the negative/positive label ratio of ``dtrain``.

    ``param`` is updated in place; ``dtrain`` and ``dtest`` are returned
    unchanged as part of the ``(dtrain, dtest, param)`` tuple.
    """
    y = dtrain.get_label()
    n_negative = np.sum(y == 0)
    n_positive = np.sum(y == 1)
    param['scale_pos_weight'] = float(n_negative) / n_positive
    return (dtrain, dtest, param)
# cross validation with a preprocessing hook:
# for each fold, dtrain, dtest and param are passed into fpreproc,
# and the values it returns are used to generate the results of that fold
xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'}, seed=0,
       fpreproc=fpreproc)

###
# cross validation also works with a customized loss function
# See custom_objective.py
##
print ('running cross validation, with cutomsized loss function')
def logregobj(preds, dtrain):
    """Logistic-loss objective: return ``(grad, hess)`` for margin predictions.

    ``preds`` are passed through the logistic function first; the gradient
    is ``p - label`` and the second-order gradient is ``p * (1 - p)``.
    """
    y = dtrain.get_label()
    prob = 1.0 / (1.0 + np.exp(-preds))
    return prob - y, prob * (1.0 - prob)
def evalerror(preds, dtrain):
    """Classification error for margin predictions.

    ``preds`` are cut off at 0 and compared with the labels of ``dtrain``.
    Returns the pair ``('error', fraction_of_mismatches)``.
    """
    labels = dtrain.get_label()
    mismatches = labels != (preds > 0.0)
    # np.sum reduces the array in one vectorized call instead of iterating it in Python
    return 'error', float(np.sum(mismatches)) / len(labels)
# no 'objective' entry here: the loss is supplied through obj below
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
}
# cross validation with the customized objective and evaluation function
xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
       obj=logregobj, feval=evalerror)

View File

@@ -0,0 +1,42 @@
#!/usr/bin/python
import numpy as np
import xgboost as xgb
###
# advanced: customized loss function
#
print ('start running example to used cutomized objective function')

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')

# note: with a customized objective function, objective is left as default
# note: predictions are then margin values
# you must know what you are doing
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
# user-defined objective function: given the predictions, return the
# gradient and the second-order gradient.
# this one is the log-likelihood loss
def logregobj(preds, dtrain):
    """Return ``(grad, hess)`` of the logistic loss for margin predictions ``preds``."""
    y = dtrain.get_label()
    # map the margins to probabilities with the logistic function
    prob = 1.0 / (1.0 + np.exp(-preds))
    return prob - y, prob * (1.0 - prob)
# user-defined evaluation function, returns a pair (metric_name, result)
# NOTE: with a customized loss function, the default prediction value is the margin.
# this may make the built-in evaluation metrics not function properly:
# for example, with logistic loss the prediction is the score before the logistic
# transformation, while the built-in evaluation error assumes its input is after it.
# Keep this in mind when customizing; you may need to write a customized evaluation function
def evalerror(preds, dtrain):
    """Classification error for margin predictions.

    Returns the pair ``('error', fraction_of_mismatches)``.
    """
    labels = dtrain.get_label()
    # preds are margins (before the logistic transformation), so the cutoff is at 0
    mismatches = labels != (preds > 0.0)
    # np.sum reduces the array in one vectorized call instead of iterating it in Python
    return 'error', float(np.sum(mismatches)) / len(labels)
# train with the customized objective and evaluation function.
# step-by-step training is also possible:
# simply look at xgboost.py's implementation of train
bst = xgb.train(
    param, dtrain, num_round, watchlist,
    logregobj, evalerror)

View File

@@ -0,0 +1,25 @@
#!/usr/bin/python
import numpy as np
import scipy.sparse
import xgboost as xgb
### simple example of the external memory version
# the only difference from the in-memory version: append a '#' followed by
# a cache prefix name to the data path.
# several cache files with that prefix will be generated.
# currently only conversion from a libsvm file is supported
dtrain = xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache')
dtest = xgb.DMatrix('../data/agaricus.txt.test#dtest.cache')

# training parameters
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}

# performance notice: set nthread to the number of real cpu cores.
# some cpus offer two threads per core; for example, on a 4 core cpu
# with 8 threads, set nthread=4
#param['nthread']=num_real_cpu

# validation sets to watch performance on
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)

View File

@@ -0,0 +1,30 @@
#!/usr/bin/python
import xgboost as xgb
##
# this script demonstrates how to fit a generalized linear model in xgboost:
# the boosters are linear models instead of trees
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')

# booster is set to gblinear, so that a linear model is fitted
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# lambda_bias, the L2 regularizer on the bias term, can be set as well
param = {
    'silent': 1,
    'objective': 'binary:logistic',
    'booster': 'gblinear',
    'alpha': 0.0001,
    'lambda': 1,
}

# normally eta (step_size) does not need to be set.
# XGBoost uses a parallel coordinate descent algorithm (shotgun), and
# parallelization can affect convergence in certain cases;
# setting eta to a smaller value, e.g. 0.5, can make the optimization more stable
# param['eta'] = 1

##
# the rest of the settings are the same
##
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)

# report the test error at a 0.5 cutoff
preds = bst.predict(dtest)
labels = dtest.get_label()
n_wrong = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
print ('error=%f' % (n_wrong / float(len(preds))))

View File

@@ -0,0 +1,20 @@
#!/usr/bin/python
import numpy as np
import xgboost as xgb
### load the data and train
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)

print ('start testing prediction from first n trees')
label = dtest.get_label()
### predict using only the first tree
ypred1 = bst.predict(dtest, ntree_limit=1)
# without ntree_limit, all the trees are used
ypred2 = bst.predict(dtest)

# error rate of each prediction at a 0.5 cutoff
err1 = np.sum((ypred1 > 0.5) != label) / float(len(label))
err2 = np.sum((ypred2 > 0.5) != label) / float(len(label))
print ('error of ypred1=%f' % (err1))
print ('error of ypred2=%f' % (err2))

View File

@@ -0,0 +1,20 @@
#!/usr/bin/python
import numpy as np
import xgboost as xgb
### load the data and train
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)

print ('start testing predict the leaf indices')
### predict using the first 2 trees
leafindex = bst.predict(dtest, ntree_limit=2, pred_leaf = True)
# print(...) with a single argument gives the same output on Python 2 and 3;
# the bare print statement is a SyntaxError on Python 3
print (leafindex.shape)
print (leafindex)
### predict using all trees
leafindex = bst.predict(dtest, pred_leaf = True)
print (leafindex.shape)

Some files were not shown because too many files have changed in this diff Show More