Merge remote-tracking branch 'upstream/master'

This commit is contained in:
Vadim Khotilovich 2016-06-09 02:32:26 -05:00
commit 264c222fe0
60 changed files with 1816 additions and 470 deletions


@ -73,7 +73,7 @@ endif
# specify tensor path
.PHONY: clean all lint clean_all doxygen rcpplint pypack Rpack Rbuild Rcheck java
.PHONY: clean all lint clean_all doxygen rcpplint pypack Rpack Rbuild Rcheck java pylint
all: lib/libxgboost.a $(XGBOOST_DYLIB) xgboost
@ -131,8 +131,11 @@ rcpplint:
python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} R-package/src
lint: rcpplint
python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} include src plugin
python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} include src plugin python-package
pylint:
flake8 --ignore E501 python-package
flake8 --ignore E501 tests/python
clean:
$(RM) -rf build build_plugin lib bin *~ */*~ */*/*~ */*/*/*~ */*.o */*/*.o */*/*/*.o xgboost


@ -31,7 +31,9 @@ This file records the changes in xgboost library in reverse chronological order.
* JVM Package
- Enable xgboost4j for java and scala
- XGBoost distributed now runs on Flink and Spark.
* Support model attribute listing for metadata.
- https://github.com/dmlc/xgboost/pull/1198
- https://github.com/dmlc/xgboost/pull/1166
## v0.47 (2016.01.14)


@ -43,12 +43,7 @@ License
-------
© Contributors, 2016. Licensed under an [Apache-2](https://github.com/dmlc/xgboost/blob/master/LICENSE) license.
Reference
---------
- Tianqi Chen and Carlos Guestrin. [XGBoost: A Scalable Tree Boosting System](http://arxiv.org/abs/1603.02754). Arxiv.1603.02754
- Tianqi Chen and Carlos Guestrin. [XGBoost: A Scalable Tree Boosting System](http://arxiv.org/abs/1603.02754). In 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016
- XGBoost originates from research project at University of Washington, see also the [Project Page at UW](http://dmlc.cs.washington.edu/xgboost.html).
Acknowledgements
----------------
- This work was supported in part by ONR (PECASE) N000141010672, NSF IIS 1258741 and the TerraSwarm Research Center sponsored by MARCO and DARPA.


@ -66,6 +66,7 @@ However, the parameter settings can be applied to all versions
- [Starter script for Kaggle Higgs Boson](kaggle-higgs)
- [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution)
- [Benchmarking the most commonly used open source tools for binary classification](https://github.com/szilard/benchm-ml#boosting-gradient-boosted-treesgradient-boosting-machines)
## Machine Learning Challenge Winning Solutions
@ -85,6 +86,10 @@ Please send pull requests if you find ones that are missing here.
- Owen Zhang, 1st place of the [Avito Context Ad Clicks competition](https://www.kaggle.com/c/avito-context-ad-clicks). Link to [the Kaggle interview](http://blog.kaggle.com/2015/08/26/avito-winners-interview-1st-place-owen-zhang/).
- Keiichi Kuroyanagi, 2nd place of the [Airbnb New User Bookings](https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings). Link to [the Kaggle interview](http://blog.kaggle.com/2016/03/17/airbnb-new-user-bookings-winners-interview-2nd-place-keiichi-kuroyanagi-keiku/).
- Marios Michailidis, Mathias Müller and Ning Situ, 1st place [Homesite Quote Conversion](https://www.kaggle.com/c/homesite-quote-conversion). Link to [the Kaggle interview](http://blog.kaggle.com/2016/04/08/homesite-quote-conversion-winners-write-up-1st-place-kazanova-faron-clobber/).
## Talks
- [XGBoost: A Scalable Tree Boosting System](http://datascience.la/xgboost-workshop-and-meetup-talk-with-tianqi-chen/) (video+slides) by Tianqi Chen at the Los Angeles Data Science meetup
## Tutorials
- [XGBoost Official RMarkdown Tutorials](https://xgboost.readthedocs.org/en/latest/R-package/index.html#tutorials)


@ -12,15 +12,18 @@ print ('running cross validation')
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed = 0)
metrics={'error'}, seed = 0,
callbacks=[xgb.callback.print_evaluation(show_stdv=True)])
print ('running cross validation, disable standard deviation display')
# do cross validation; this will print the result as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed = 0, show_stdv = False)
res = xgb.cv(param, dtrain, num_boost_round=10, nfold=5,
metrics={'error'}, seed = 0,
callbacks=[xgb.callback.print_evaluation(show_stdv=False),
xgb.callback.early_stop(3)])
print (res)
print ('running cross validation, with preprocessing function')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
@ -58,4 +61,3 @@ param = {'max_depth':2, 'eta':1, 'silent':1}
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
obj = logregobj, feval=evalerror)
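Putting the hunks above together, here is a minimal self-contained sketch of the new callback-based `xgb.cv` API (a sketch only, assuming it is run from the xgboost repository root so the demo data path resolves):

```python
import xgboost as xgb

# train on the bundled demo dataset
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}

# progress reporting now goes through callbacks;
# early_stop(3) aborts once the metric stops improving for 3 rounds
res = xgb.cv(param, dtrain, num_boost_round=10, nfold=5,
             metrics={'error'}, seed=0,
             callbacks=[xgb.callback.print_evaluation(show_stdv=False),
                        xgb.callback.early_stop(3)])
print(res)  # per-round evaluation history (mean and std of the metric)
```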

@ -1 +1 @@
Subproject commit 1db0792e1a55355b1f07699bba18c88ded996953
Subproject commit 9fd3b48462a7a651e12a197679f71e043dcb25a2


@ -9,7 +9,7 @@ You have found the XGBoost R Package!
Get Started
-----------
* Check out the [Installation Guide](../build.md) for instructions to install xgboost, and the [Tutorials](#tutorials) for examples of using xgboost for various tasks.
* Please visit the [walk-through examples](demo).
* Please visit the [walk-through examples](../../R-package/demo).
Tutorials
---------

doc/_static/cn.svg vendored Normal file

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="640" height="480" viewBox="-5 -5 12.8 9.6">
<title>
Flag of the People&apos;s Republic of China
</title>
<defs>
<path id="b" fill="#ffde00" d="M-.588.81L0-1 .588.81-.952-.31H.952z"/>
<clipPath id="a">
<path d="M-5-15.4h26.667v20H-5z"/>
</clipPath>
</defs>
<g transform="matrix(.48 0 0 .48 -2.6 2.39)" clip-path="url(#a)">
<path fill="#de2910" d="M-5-15.4h30v20H-5z"/>
<use height="20" width="30" transform="matrix(3 0 0 3 0 -10.4)" xlink:href="#b"/>
<use height="20" width="30" transform="rotate(-120.93 -1.29 -8.12)" xlink:href="#b"/>
<use height="20" width="30" transform="rotate(-98.11 -1.45 -8.74)" xlink:href="#b"/>
<use height="20" width="30" transform="rotate(-74.04 -2.07 -8.84)" xlink:href="#b"/>
<use height="20" width="30" transform="rotate(-51.32 -4.16 -8.4)" xlink:href="#b"/>
</g>
</svg>


doc/_static/js/auto_module_index.js vendored Normal file

@ -0,0 +1,25 @@
function auto_index(module) {
$(document).ready(function () {
// find all classes or functions
var div_query = "div[class='section'][id='module-" + module + "']";
var class_query = div_query + " dl[class='class'] > dt";
var func_query = div_query + " dl[class='function'] > dt";
var targets = $(class_query + ',' + func_query);
var li_node = $("li a[href='#module-" + module + "']").parent();
var html = "<ul>";
for (var i = 0; i < targets.length; ++i) {
var id = $(targets[i]).attr('id');
// remove 'xgboost.' prefix to make menus shorter
var id_simple = id.replace(/^xgboost\./, '');
html += "<li><a class='reference internal' href='#";
html += id;
html += "'>" + id_simple + "</a></li>";
}
html += "</ul>";
li_node.append(html);
});
}

doc/_static/us.svg vendored Normal file

@ -0,0 +1,117 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg id="svg153" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://www.w3.org/2000/svg" height="480" width="640" version="1.1" xmlns:cc="http://creativecommons.org/ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
<metadata id="metadata3151">
<rdf:RDF>
<cc:Work rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
</cc:Work>
</rdf:RDF>
</metadata>
<desc id="desc3066">The United States of America flag, produced by Daniel McRae</desc>
<defs id="defs155">
<clipPath id="clipPath4293" clipPathUnits="userSpaceOnUse">
<rect id="rect4295" fill-opacity="0.67" height="512" width="682.67" y=".0000052307" x="0"/>
</clipPath>
</defs>
<g id="flag" fill-rule="evenodd" clip-path="url(#clipPath4293)" transform="matrix(.93750 0 0 .93750 0 -.0000049038)">
<g id="g390" stroke-width="1pt" transform="matrix(3.9385 0 0 3.9385 0 .000005)">
<g id="g169" fill="#bd3d44">
<rect id="rect156" height="10" width="247" y="0" x="0"/>
<rect id="rect158" height="10" width="247" y="20" x="0"/>
<rect id="rect160" height="10" width="247" y="40" x="0"/>
<rect id="rect162" height="10" width="247" y="60" x="0"/>
<rect id="rect164" height="10" width="247" y="80" x="0"/>
<rect id="rect166" height="10" width="247" y="100" x="0"/>
<rect id="rect168" height="10" width="247" y="120" x="0"/>
</g>
<g id="g177" fill="#fff">
<rect id="rect157" height="10" width="247" y="10" x="0"/>
<rect id="rect159" height="10" width="247" y="30" x="0"/>
<rect id="rect161" height="10" width="247" y="50" x="0"/>
<rect id="rect163" height="10" width="247" y="70" x="0"/>
<rect id="rect165" height="10" width="247" y="90" x="0"/>
<rect id="rect167" height="10" width="247" y="110" x="0"/>
</g>
</g>
<rect id="rect200" height="275.69" width="389.12" y="0.000005" x="0" stroke-width="1pt" fill="#192f5d"/>
<g id="g274" fill="#fff" transform="matrix(3.9385 0 0 3.9385 0 .000005)">
<g id="g218">
<g id="g194">
<polygon id="polygon207" transform="translate(8.2333 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon188" transform="translate(24.7,7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon189" transform="translate(41.167 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon190" transform="translate(57.633 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon191" transform="translate(74.1,7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon192" transform="translate(90.567 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
</g>
<g id="g205">
<polygon id="polygon193" transform="translate(16.467 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon201" transform="translate(32.933 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon202" transform="translate(49.4,14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon203" transform="translate(65.867 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon204" transform="translate(82.333 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
</g>
</g>
<g id="g232" transform="translate(0,14)">
<g id="g233">
<polygon id="polygon234" transform="translate(8.2333 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon235" transform="translate(24.7,7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon236" transform="translate(41.167 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon237" transform="translate(57.633 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon238" transform="translate(74.1,7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon239" transform="translate(90.567 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
</g>
<g id="g240">
<polygon id="polygon241" transform="translate(16.467 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon242" transform="translate(32.933 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon243" transform="translate(49.4,14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon244" transform="translate(65.867 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon245" transform="translate(82.333 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
</g>
</g>
<g id="g246" transform="translate(0,28)">
<g id="g247">
<polygon id="polygon248" transform="translate(8.2333 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon249" transform="translate(24.7,7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon250" transform="translate(41.167 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon251" transform="translate(57.633 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon252" transform="translate(74.1,7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon253" transform="translate(90.567 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
</g>
<g id="g254">
<polygon id="polygon255" transform="translate(16.467 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon256" transform="translate(32.933 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon257" transform="translate(49.4,14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon258" transform="translate(65.867 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon259" transform="translate(82.333 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
</g>
</g>
<g id="g260" transform="translate(0,42)">
<g id="g261">
<polygon id="polygon262" transform="translate(8.2333 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon263" transform="translate(24.7,7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon264" transform="translate(41.167 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon265" transform="translate(57.633 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon266" transform="translate(74.1,7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon267" transform="translate(90.567 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
</g>
<g id="g268">
<polygon id="polygon269" transform="translate(16.467 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon270" transform="translate(32.933 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon271" transform="translate(49.4,14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon272" transform="translate(65.867 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon273" transform="translate(82.333 14)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
</g>
</g>
<g id="g211" transform="translate(0,56)">
<polygon id="polygon212" transform="translate(8.2333 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon213" transform="translate(24.7,7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon214" transform="translate(41.167 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon215" transform="translate(57.633 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon216" transform="translate(74.1,7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
<polygon id="polygon217" transform="translate(90.567 7)" d="M 1.3084868e-6,-4.0040002 0.89895399,-1.237304 3.8080309,-1.2373029 1.4545381,0.47260808 2.3534912,3.239305 0,1.5293919 l -2.3534933,1.7099115 0.8989552,-2.76669531 -2.353492,-1.70991339 2.9090761,1.3e-6 z" points="0 -4.004 0.89895 -1.2373 3.808 -1.2373 1.4545 0.47261 2.3535 3.2393 0 1.5294 -2.3535 3.2393 -1.4545 0.47261 -3.808 -1.2373 -0.89895 -1.2373"/>
</g>
</g>
</g>
</svg>


doc/_static/xgboost-theme/footer.html vendored Normal file

@ -0,0 +1,5 @@
<div class="container">
<div class="footer">
<p> © 2015-2016 DMLC. All rights reserved. </p>
</div>
</div>

doc/_static/xgboost-theme/index.html vendored Normal file

@ -0,0 +1,58 @@
<div class="splash">
<div class="container">
<div class="row">
<div class="col-lg-12">
<h1>Scalable and Flexible Gradient Boosting</h1>
<div id="social">
<span>
<iframe src="https://ghbtns.com/github-btn.html?user=dmlc&repo=xgboost&type=star&count=true&v=2"
frameborder="0" scrolling="0" width="120px" height="20px"></iframe>
<iframe src="https://ghbtns.com/github-btn.html?user=dmlc&repo=xgboost&type=fork&count=true&v=2"
frameborder="0" scrolling="0" width="100px" height="20px"></iframe>
</span>
</div> <!-- end of social -->
<div class="get_start">
<a href="get_started/" class="get_start_btn">Get Started</a>
</div> <!-- end of get started button -->
</div>
</div>
</div>
</div>
<div class="section-tout">
<div class="container">
<div class="row">
<div class="col-lg-4 col-sm-6">
<h3><i class="fa fa-flag"></i> Flexible</h3>
<p>Supports regression, classification, ranking and user-defined objectives.
</p>
</div>
<div class="col-lg-4 col-sm-6">
<h3><i class="fa fa-cube"></i> Portable</h3>
<p>Runs on Windows, Linux and OS X, as well as various cloud platforms</p>
</div>
<div class="col-lg-4 col-sm-6">
<h3><i class="fa fa-wrench"></i>Multiple Lanuages</h3>
<p>Supports multiple languages including C++, Python, R, Java, Scala, Julia.</p>
</div>
<div class="col-lg-4 col-sm-6">
<h3><i class="fa fa-cogs"></i> Battle-tested</h3>
<p>Wins many data science and machine learning challenges.
Used in production by multiple companies.
</p>
</div>
<div class="col-lg-4 col-sm-6">
<h3><i class="fa fa-cloud"></i>Distributed on Cloud</h3>
<p>Supports distributed training on multiple machines, including AWS,
GCE, Azure, and Yarn clusters. Can be integrated with Flink, Spark and other cloud dataflow systems.</p>
</div>
<div class="col-lg-4 col-sm-6">
<h3><i class="fa fa-rocket"></i> Performance</h3>
<p>A well-optimized backend system delivers the best performance with limited resources.
The distributed version solves problems beyond billions of examples with the same code.
</p>
</div>
</div>
</div>
</div>
</div>

doc/_static/xgboost-theme/layout.html vendored Normal file

@ -0,0 +1,156 @@
{%- block doctype -%}
<!DOCTYPE html>
{%- endblock %}
{%- set reldelim1 = reldelim1 is not defined and ' &raquo;' or reldelim1 %}
{%- set reldelim2 = reldelim2 is not defined and ' |' or reldelim2 %}
{%- set render_sidebar = (not embedded) and (not theme_nosidebar|tobool) and
(sidebars != []) %}
{%- set url_root = pathto('', 1) %}
{%- if url_root == '#' %}{% set url_root = '' %}{% endif %}
{%- if not embedded and docstitle %}
{%- set titlesuffix = " &mdash; "|safe + docstitle|e %}
{%- else %}
{%- set titlesuffix = "" %}
{%- endif %}
{%- macro searchform(classes, button) %}
<form class="{{classes}}" role="search" action="{{ pathto('search') }}" method="get">
<div class="form-group">
<input type="text" name="q" class="form-control" {{ 'placeholder="Search"' if not button }} >
</div>
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
{% if button %}
<input type="submit" class="btn btn-default" value="search">
{% endif %}
</form>
{%- endmacro %}
{%- macro sidebarglobal() %}
<ul class="globaltoc">
{{ toctree(maxdepth=2|toint, collapse=False,includehidden=theme_globaltoc_includehidden|tobool) }}
</ul>
{%- endmacro %}
{%- macro sidebar() %}
{%- if render_sidebar %}
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
{%- block sidebartoc %}
{%- include "localtoc.html" %}
{%- endblock %}
</div>
</div>
{%- endif %}
{%- endmacro %}
{%- macro script() %}
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
URL_ROOT: '{{ url_root }}',
VERSION: '{{ release|e }}',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '{{ '' if no_search_suffix else file_suffix }}',
HAS_SOURCE: {{ has_source|lower }}
};
</script>
{% for name in ['jquery.js', 'underscore.js', 'doctools.js', 'searchtools.js'] %}
<script type="text/javascript" src="{{ pathto('_static/' + name, 1) }}"></script>
{% endfor %}
<script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
<!-- {%- for scriptfile in script_files %} -->
<!-- <script type="text/javascript" src="{{ pathto(scriptfile, 1) }}"></script> -->
<!-- {%- endfor %} -->
{%- endmacro %}
{%- macro css() %}
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" crossorigin="anonymous">
{% if pagename == 'index' %}
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css">
{%- else %}
<link rel="stylesheet" href="{{ pathto('_static/' + style, 1) }}" type="text/css" />
<link rel="stylesheet" href="{{ pathto('_static/pygments.css', 1) }}" type="text/css" />
{%- endif %}
<link rel="stylesheet" href="{{ pathto('_static/xgboost.css', 1) }}">
{%- endmacro %}
<html lang="en">
<head>
<meta charset="{{ encoding }}">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
{# The above 3 meta tags *must* come first in the head; any other head content
must come *after* these tags. #}
{{ metatags }}
{%- block htmltitle %}
{%- if pagename != 'index' %}
<title>{{ title|striptags|e }}{{ titlesuffix }}</title>
{%- else %}
<title>XGBoost Documents</title>
{%- endif %}
{%- endblock %}
{{ css() }}
{%- if not embedded %}
{{ script() }}
{%- if use_opensearch %}
<link rel="search" type="application/opensearchdescription+xml"
title="{% trans docstitle=docstitle|e %}Search within {{ docstitle }}{% endtrans %}"
href="{{ pathto('_static/opensearch.xml', 1) }}"/>
{%- endif %}
{%- if favicon %}
<link rel="shortcut icon" href="{{ pathto('_static/' + favicon, 1) }}"/>
{%- endif %}
{%- endif %}
{%- block linktags %}
{%- if hasdoc('about') %}
<link rel="author" title="{{ _('About these documents') }}" href="{{ pathto('about') }}" />
{%- endif %}
{%- if hasdoc('genindex') %}
<link rel="index" title="{{ _('Index') }}" href="{{ pathto('genindex') }}" />
{%- endif %}
{%- if hasdoc('search') %}
<link rel="search" title="{{ _('Search') }}" href="{{ pathto('search') }}" />
{%- endif %}
{%- if hasdoc('copyright') %}
<link rel="copyright" title="{{ _('Copyright') }}" href="{{ pathto('copyright') }}" />
{%- endif %}
{%- if parents %}
<link rel="up" title="{{ parents[-1].title|striptags|e }}" href="{{ parents[-1].link|e }}" />
{%- endif %}
{%- if next %}
<link rel="next" title="{{ next.title|striptags|e }}" href="{{ next.link|e }}" />
{%- endif %}
{%- if prev %}
<link rel="prev" title="{{ prev.title|striptags|e }}" href="{{ prev.link|e }}" />
{%- endif %}
{%- endblock %}
{%- block extrahead %} {% endblock %}
<link rel="icon" type="image/png" href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png">
</head>
<body role="document">
{%- include "navbar.html" %}
{% if pagename != 'index' %}
<div class="container">
<div class="row">
{{ sidebar() }}
<div class="content">
{% block body %} {% endblock %}
{%- include "footer.html" %}
</div>
</div>
</div>
{%- else %}
{%- include "index.html" %}
{%- include "footer.html" %}
{%- endif %} <!-- pagename != index -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" crossorigin="anonymous"></script>
</body>
</html>

doc/_static/xgboost-theme/navbar.html vendored Normal file

@ -0,0 +1,40 @@
<div class="navbar navbar-default navbar-fixed-top">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul id="navbar" class="navbar navbar-left">
<li> <a href="{{url_root}}">XGBoost</a> </li>
{% for name in ['Get Started', 'Tutorials', 'How To'] %}
<li> <a href="{{url_root}}{{name.lower()|replace(" ", "_")}}/index.html">{{name}}</a> </li>
{% endfor %}
{% for name in ['Packages'] %}
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">{{name}} <span class="caret"></span></a>
<ul class="dropdown-menu">
<li><a href="{{url_root}}python/index.html">Python</a></li>
<li><a href="{{url_root}}R-package/index.html">R</a></li>
<li><a href="{{url_root}}jvm/index.html">JVM</a></li>
<li><a href="{{url_root}}julia/index.html">Julia</a></li>
<li><a href="{{url_root}}cli/index.html">CLI</a></li>
</ul>
</li>
{% endfor %}
<li> <a href="{{url_root}}/parameter.html"> Knobs </a> </li>
<li> {{searchform('', False)}} </li>
</ul>
<!--
<ul id="navbar" class="navbar navbar-right">
<li> <a href="{{url_root}}index.html"><span class="flag-icon flag-icon-us"></span></a> </li>
<li> <a href="{{url_root}}/zh/index.html"><span class="flag-icon flag-icon-cn"></span></a> </li>
</ul>
navbar -->
</div>
</div>
</div>

doc/_static/xgboost-theme/theme.conf vendored Normal file

@ -0,0 +1,2 @@
[theme]
inherit = basic

doc/_static/xgboost.css vendored Normal file

@ -0,0 +1,232 @@
/* header section */
.splash{
padding:5em 0 1em 0;
background-color:#0079b2;
/* background-image:url(../img/bg.jpg); */
background-size:cover;
background-attachment:fixed;
color:#fff;
text-align:center
}
.splash h1{
font-size: 40px;
margin-bottom: 20px;
}
.splash .social{
margin:2em 0
}
.splash .get_start {
margin:2em 0
}
.splash .get_start_btn {
border: 2px solid #FFFFFF;
border-radius: 5px;
color: #FFFFFF;
display: inline-block;
font-size: 26px;
padding: 9px 20px;
}
.section-tout{
padding:3em 0 3em;
border-bottom:1px solid rgba(0,0,0,.05);
background-color:#eaf1f1
}
.section-tout .fa{
margin-right:.5em
}
.section-tout h3{
font-size:20px;
}
.section-tout p {
margin-bottom:2em
}
.section-inst{
padding:3em 0 3em;
border-bottom:1px solid rgba(0,0,0,.05);
text-align:center
}
.section-inst p {
margin-bottom:2em
}
.section-inst img {
-webkit-filter: grayscale(90%); /* Chrome, Safari, Opera */
filter: grayscale(90%);
margin-bottom:2em
}
.section-inst img:hover {
-webkit-filter: grayscale(0%); /* Chrome, Safari, Opera */
filter: grayscale(0%);
}
.footer{
padding-top: 40px;
}
.footer li{
float:right;
margin-right:1.5em;
margin-bottom:1.5em
}
.footer p{
font-size: 15px;
color: #888;
clear:right;
margin-bottom:0
}
/* sidebar */
div.sphinxsidebar {
margin-top: 20px;
margin-left: 0;
position: fixed;
overflow-y: scroll;
width: 250px;
top: 52px;
bottom: 0;
display: none
}
div.sphinxsidebar ul { padding: 0 }
div.sphinxsidebar ul ul { margin-left: 15px }
@media (min-width:1200px) {
.content { float: right; width: 66.66666667%; margin-right: 5% }
div.sphinxsidebar {display: block}
}
.github-btn { border: 0; overflow: hidden }
.container {
margin-right: auto;
margin-left: auto;
padding-left: 15px;
padding-right: 15px
}
body>.container {
padding-top: 80px
}
body {
font-size: 16px;
}
pre {
font-size: 14px;
}
/* navbar */
.navbar {
background-color:#0079b2;
border: 0px;
height: 65px;
}
.navbar-right li {
display:inline-block;
vertical-align:top;
padding: 22px 4px;
}
.navbar-left li {
display:inline-block;
vertical-align:top;
padding: 17px 10px;
/* margin: 0 5px; */
}
.navbar-left li a {
font-size: 22px;
color: #fff;
}
.navbar-left > li > a:hover{
color:#fff;
}
.flag-icon {
background-size: contain;
background-position: 50%;
background-repeat: no-repeat;
position: relative;
display: inline-block;
width: 1.33333333em;
line-height: 1em;
}
.flag-icon:before {
content: "\00a0";
}
.flag-icon-cn {
background-image: url(./cn.svg);
}
.flag-icon-us {
background-image: url(./us.svg);
}
/* .flags { */
/* padding: 10px; */
/* } */
.navbar-brand >img {
width: 110px;
}
.dropdown-menu li {
padding: 0px 0px;
width: 120px;
}
.dropdown-menu li a {
color: #0079b2;
font-size: 20px;
}
.section h1 {
padding-top: 90px;
margin-top: -60px;
padding-bottom: 10px;
font-size: 28px;
}
.section h2 {
padding-top: 80px;
margin-top: -60px;
padding-bottom: 10px;
font-size: 22px;
}
.section h3 {
padding-top: 80px;
margin-top: -64px;
padding-bottom: 8px;
}
.section h4 {
padding-top: 80px;
margin-top: -64px;
padding-bottom: 8px;
}
dt {
margin-top: -76px;
padding-top: 76px;
}
dt:target, .highlighted {
background-color: #fff;
}
.section code.descname {
font-size: 1em;
}

doc/cli/index.md Normal file

@ -0,0 +1,3 @@
# XGBoost Command Line version
See [XGBoost Command Line walkthrough](https://github.com/dmlc/xgboost/blob/master/demo/binary_classification/README.md)


@ -117,10 +117,11 @@ todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
html_theme_path = ['_static']
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
# html_theme = 'alabaster'
html_theme = 'sphinx_rtd_theme'
html_theme = 'xgboost-theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,

doc/get_started/index.md Normal file

@ -0,0 +1,80 @@
# Get Started with XGBoost
This is a quick-start tutorial with snippets for you to quickly try out xgboost
on the demo dataset for a binary classification task.
## Links to Other Helpful Resources
- See [Installation Guide](../build.md) on how to install xgboost.
- See the [How-to pages](../how_to/index.md) for various tips on using xgboost.
- See [Tutorials](../tutorials/index.md) for tutorials on specific tasks.
- See [Learning to use XGBoost by Examples](../../demo) for more code examples.
## Python
```python
import xgboost as xgb
# read in data
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
dtest = xgb.DMatrix('demo/data/agaricus.txt.test')
# specify parameters via map
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
num_round = 2
bst = xgb.train(param, dtrain, num_round)
# make prediction
preds = bst.predict(dtest)
```
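`DMatrix` can also be constructed from in-memory data rather than a libsvm file; a short sketch, using randomly generated NumPy arrays purely as stand-ins for real features and labels:

```python
import numpy as np
import xgboost as xgb

data = np.random.rand(100, 10)          # 100 rows, 10 features (synthetic)
label = np.random.randint(2, size=100)  # synthetic binary labels
dtrain = xgb.DMatrix(data, label=label)
```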
## R
```r
# load data
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
# fit model
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
nthread = 2, objective = "binary:logistic")
# predict
pred <- predict(bst, test$data)
```
## Julia
```julia
using XGBoost
# read data
train_X, train_Y = readlibsvm("demo/data/agaricus.txt.train", (6513, 126))
test_X, test_Y = readlibsvm("demo/data/agaricus.txt.test", (1611, 126))
# fit model
num_round = 2
bst = xgboost(train_X, num_round, label=train_Y, eta=1, max_depth=2)
# predict
pred = predict(bst, test_X)
```
## Scala
```scala
import ml.dmlc.xgboost4j.scala.DMatrix
import ml.dmlc.xgboost4j.scala.XGBoost
object XGBoostScalaExample {
def main(args: Array[String]) {
// read training data, available at xgboost/demo/data
val trainData =
new DMatrix("/path/to/agaricus.txt.train")
// define parameters
val paramMap = List(
"eta" -> 0.1,
"max_depth" -> 2,
"objective" -> "binary:logistic").toMap
// number of iterations
val round = 2
// train the model
val model = XGBoost.train(trainData, paramMap, round)
// run prediction
val predTrain = model.predict(trainData)
// save model to the file.
model.saveModel("/local/path/to/model")
}
}
```

doc/how_to/index.md Normal file

@ -0,0 +1,16 @@
# XGBoost How To
This page contains guidelines for using and developing XGBoost.
## Installation
- [How to Install XGBoost](../build.md)
## Use XGBoost in Specific Ways
- [Parameter tuning guide](param_tuning.md)
- [Use out-of-core computation for large datasets](external_memory.md)
## Develop and Hack XGBoost
- [Contribute to XGBoost](contribute.md)
## Frequently Asked Questions
- [FAQ](../faq.md)


@ -1,59 +1,15 @@
XGBoost Documentation
=====================
This is the documentation of the xgboost library.
XGBoost is short for eXtreme Gradient Boosting. It is a library designed and optimized for boosted (tree) algorithms.
The goal of the library is to push the computation limits of machines to the extreme in order to provide a ***scalable***, ***portable*** and ***accurate*** library
for large-scale tree boosting.
This document is hosted at http://xgboost.readthedocs.org/. You can also browse most of the documents on GitHub directly.
Package Documents
-----------------
This section contains language-specific package guides.
* [XGBoost Command Line Usage Walkthrough](../demo/binary_classification/README.md)
These are used to generate the index used in search.
* [Python Package Document](python/index.md)
* [R Package Document](R-package/index.md)
* [Java/Scala Package Document](jvm/index.md)
* [XGBoost.jl Julia Package](https://github.com/dmlc/XGBoost.jl)
User Guides
-----------
This section contains user guides that are general across languages.
* [Installation Guide](build.md)
* [Introduction to Boosted Trees](model.md)
* [Distributed Training Tutorial](tutorial/aws_yarn.md)
* [Frequently Asked Questions](faq.md)
* [External Memory Version](external_memory.md)
* [Learning to use XGBoost by Example](../demo)
* [Parameters](parameter.md)
* [Text input format](input_format.md)
* [Notes on Parameter Tuning](param_tuning.md)
Tutorials
---------
This section contains official tutorials of the XGBoost package.
See [Awesome XGBoost](https://github.com/dmlc/xgboost/tree/master/demo) for links to more resources.
* [Introduction to XGBoost in R](R-package/xgboostPresentation.md) (R package)
- This is a general presentation about xgboost in R.
* [Discover your data with XGBoost in R](R-package/discoverYourData.md) (R package)
- This tutorial explains feature analysis in xgboost.
* [Introduction of XGBoost in Python](python/python_intro.md) (python)
- This tutorial introduces the python package of xgboost.
* [Understanding XGBoost Model on Otto Dataset](../demo/kaggle-otto/understandingXGBoostModel.Rmd) (R package)
- This tutorial teaches you how to use xgboost to compete in the Kaggle Otto challenge.
Developer Guide
---------------
* [Contributor Guide](dev-guide/contribute.md)
Indices and tables
------------------
```eval_rst
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
```
* [Julia Package Document](julia/index.md)
* [CLI Package Document](cli/index.md)
- [How-to Documents](how_to/index.md)
- [Get Started Documents](get_started/index.md)
- [Tutorials](tutorials/index.md)

doc/julia/index.md Normal file

@ -0,0 +1,3 @@
# XGBoost.jl
See [XGBoost.jl Project page](https://github.com/dmlc/XGBoost.jl)


@ -7,20 +7,24 @@ You have found the XGBoost JVM Package!
Installation
------------
Currently, XGBoost4J only supports installation from source. Building XGBoost4J using Maven requires Maven 3 or newer and Java 7+.
Before you install XGBoost4J, you need to define the environment variable `JAVA_HOME` to point at your JDK directory, to ensure that your compiler can find `jni.h` correctly, since XGBoost4J relies on JNI to implement the interaction between the JVM and native libraries.
After your `JAVA_HOME` is defined correctly, it is as simple as running `mvn package` under the jvm-packages directory to install XGBoost4J.
After your `JAVA_HOME` is defined correctly, it is as simple as running `mvn package` under the jvm-packages directory to install XGBoost4J. You can also skip the tests by running `mvn -DskipTests=true package`, if you are sure about the correctness of your local setup.
NOTE: XGBoost4J requires Spark 1.6 or newer.
XGBoost4J-Spark, which integrates XGBoost with Spark, requires Spark 1.6 or newer (the default version is 1.6.1). You can build XGBoost4J-Spark as a component of XGBoost4J by running `mvn package`, or specify the Spark version with `mvn -Dspark.version=1.6.0 package`.
Contents
--------
* [Java Overview Tutorial](java_intro.md)
Resources
---------
* [Code Examples](https://github.com/dmlc/xgboost/tree/master/jvm-packages/xgboost4j-example)
* [Java API Docs](http://dmlc.ml/docs/javadocs/index.html)
* [Scala API Docs]
## Scala API Docs
* [XGBoost4J](http://dmlc.ml/docs/scaladocs/xgboost4j/index.html)
* [XGBoost4J-Spark](http://dmlc.ml/docs/scaladocs/xgboost4j-spark/index.html)
* [XGBoost4J-Flink](http://dmlc.ml/docs/scaladocs/xgboost4j-flink/index.html)


@ -13,7 +13,8 @@ In R-package, you can use .(dot) to replace under score in the parameters, for e
General Parameters
------------------
* booster [default=gbtree]
- which booster to use, can be gbtree or gblinear. gbtree uses a tree-based model while gblinear uses a linear function.
- which booster to use, can be gbtree, gblinear or dart.
 gbtree and dart use tree-based models while gblinear uses a linear function.
* silent [default=0]
- 0 means printing running messages, 1 means silent mode.
* nthread [defaults to the maximum number of threads available if not set]
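Concretely, these general parameters are just entries in the parameter map handed to training; a small illustration in the Python API (the values are arbitrary examples):

```python
param = {
    'booster': 'gbtree',  # or 'gblinear' / 'dart'
    'silent': 0,          # 0: print running messages, 1: silent mode
    'nthread': 4,         # omit to use all available threads
}
```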
@ -72,7 +73,29 @@ Parameters for Tree Booster
but consider setting it to a lower number for more accurate enumeration.
- range: (0, 1)
* scale_pos_weight [default=0]
- Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases). See [Parameters Tuning](param_tuning.md) for more discussion. Also see the Higgs Kaggle competition demo for examples: [R](../demo/kaggle-higgs/higgs-train.R), [py1](../demo/kaggle-higgs/higgs-numpy.py), [py2](../demo/kaggle-higgs/higgs-cv.py), [py3](../demo/guide-python/cross_validation.py)
- Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases). See [Parameters Tuning](how_to/param_tuning.md) for more discussion. Also see the Higgs Kaggle competition demo for examples: [R](../demo/kaggle-higgs/higgs-train.R), [py1](../demo/kaggle-higgs/higgs-numpy.py), [py2](../demo/kaggle-higgs/higgs-cv.py), [py3](../demo/guide-python/cross_validation.py)
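For instance, a minimal sketch of deriving that value from a label vector (assuming a NumPy array `y` of 0/1 labels):

```python
import numpy as np

y = np.array([0, 0, 0, 0, 1])                   # hypothetical labels
ratio = float(np.sum(y == 0)) / np.sum(y == 1)  # negatives / positives
param = {'objective': 'binary:logistic', 'scale_pos_weight': ratio}
```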
Additional parameters for Dart Booster
--------------------------------------
* sample_type [default="uniform"]
- type of sampling algorithm.
- "uniform": dropped trees are selected uniformly.
- "weighted": dropped trees are selected in proportion to weight.
* normalize_type [default="tree"]
- type of normalization algorithm.
- "tree": new trees have the same weight as each of the dropped trees.
The weight of new trees is learning_rate / (k + learning_rate);
dropped trees are scaled by a factor of k / (k + learning_rate).
- "forest": new trees have the same weight as the sum of the dropped trees (forest).
The weight of new trees is learning_rate / (1 + learning_rate);
dropped trees are scaled by a factor of 1 / (1 + learning_rate).
* rate_drop [default=0.0]
- dropout rate.
- range: [0.0, 1.0]
* skip_drop [default=0.0]
- probability of skipping dropout.
If a dropout is skipped, new trees are added in the same manner as gbtree.
- range: [0.0, 1.0]
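As an illustration, a sketch of training with the dart booster using these parameters in the Python API (assuming `dtrain`/`dtest` DMatrix objects as in the other demos; the settings are arbitrary examples, not recommendations):

```python
param = {'booster': 'dart', 'max_depth': 2, 'eta': 1,
         'objective': 'binary:logistic',
         'sample_type': 'uniform', 'normalize_type': 'tree',
         'rate_drop': 0.1, 'skip_drop': 0.5}
num_round = 10
bst = xgb.train(param, dtrain, num_round)
# pass ntree_limit at prediction time so that no trees are dropped
preds = bst.predict(dtest, ntree_limit=num_round)
```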
Parameters for Linear Booster
-----------------------------

doc/tutorials/index.md Normal file

@ -0,0 +1,8 @@
# XGBoost Tutorials
This section contains official tutorials for the XGBoost package.
See [Awesome XGBoost](https://github.com/dmlc/xgboost/tree/master/demo) for links to more resources.
## Contents
- [Introduction to Boosted Trees](../model.md)
- [Distributed XGBoost YARN on AWS](aws_yarn.md)


@ -61,7 +61,6 @@ object DistTrainWithSpark {
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.registerKryoClasses(Array(classOf[Booster]))
val sc = new SparkContext(sparkConf)
val sc = new SparkContext(sparkConf)
val inputTrainPath = args(1)
val outputModelPath = args(2)
// number of iterations
@ -73,7 +72,8 @@ object DistTrainWithSpark {
"max_depth" -> 2,
"objective" -> "binary:logistic").toMap
// use 5 distributed workers to train the model
val model = XGBoost.train(trainRDD, paramMap, numRound, nWorkers = 5)
// useExternalMemory indicates whether to use the external memory cache
val model = XGBoost.train(trainRDD, paramMap, numRound, nWorkers = 5, useExternalMemory = true)
// save model to HDFS path
model.saveModelToHadoop(outputModelPath)
}


@ -23,6 +23,18 @@
<module>xgboost4j-spark</module>
<module>xgboost4j-flink</module>
</modules>
<profiles>
<profile>
<id>spark-1.x</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<properties>
<spark.version>1.6.1</spark.version>
<scala.binary.version>2.10</scala.binary.version>
</properties>
</profile>
</profiles>
<build>
<plugins>
<plugin>


@ -32,8 +32,8 @@ public class ExternalMemory {
//this is the only difference, add a # followed by a cache prefix name
//several cache files with the prefix will be generated
//currently only supports conversion from libsvm files
DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train#dtrain.cache");
DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test#dtest.cache");
DMatrix trainMat = new DMatrix("../demo/data/agaricus.txt.train#dtrain.cache");
DMatrix testMat = new DMatrix("../demo/data/agaricus.txt.test#dtest.cache");
//specify parameters
HashMap<String, Object> params = new HashMap<String, Object>();
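The same `#<cache prefix>` convention works from the Python API as well; a minimal sketch (assuming the demo libsvm files exist at these relative paths):

```python
import xgboost as xgb

# appending '#dtrain.cache' tells xgboost to stream the libsvm file through
# on-disk cache files instead of loading the whole dataset into RAM
dtrain = xgb.DMatrix('../demo/data/agaricus.txt.train#dtrain.cache')
dtest = xgb.DMatrix('../demo/data/agaricus.txt.test#dtest.cache')
```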


@ -28,7 +28,7 @@ object DistTrainWithSpark {
"usage: program num_of_rounds num_workers training_path test_path model_path")
sys.exit(1)
}
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoost-spark-example")
val sparkConf = new SparkConf().setAppName("XGBoost-spark-example")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.registerKryoClasses(Array(classOf[Booster]))
val sc = new SparkContext(sparkConf)
@ -45,7 +45,8 @@ object DistTrainWithSpark {
"eta" -> 0.1f,
"max_depth" -> 2,
"objective" -> "binary:logistic").toMap
val xgboostModel = XGBoost.train(trainRDD, paramMap, numRound, nWorkers = args(1).toInt)
val xgboostModel = XGBoost.train(trainRDD, paramMap, numRound, nWorkers = args(1).toInt,
useExternalMemory = true)
xgboostModel.predict(new DMatrix(testSet))
// save model to HDFS path
xgboostModel.saveModelAsHadoopFile(outputModelPath)

View File

@ -28,8 +28,8 @@
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.10</artifactId>
<version>1.6.1</version>
<artifactId>spark-mllib_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
</project>

View File

@ -16,6 +16,8 @@
package ml.dmlc.xgboost4j.scala.spark
import java.nio.file.Paths
import scala.collection.mutable
import scala.collection.JavaConverters._
@ -41,7 +43,8 @@ object XGBoost extends Serializable {
trainingData: RDD[LabeledPoint],
xgBoostConfMap: Map[String, Any],
rabitEnv: mutable.Map[String, String],
numWorkers: Int, round: Int, obj: ObjectiveTrait, eval: EvalTrait): RDD[Booster] = {
numWorkers: Int, round: Int, obj: ObjectiveTrait, eval: EvalTrait,
useExternalMemory: Boolean): RDD[Booster] = {
import DataUtils._
val partitionedData = {
if (numWorkers > trainingData.partitions.length) {
@ -54,11 +57,19 @@ object XGBoost extends Serializable {
trainingData
}
}
val appName = partitionedData.context.appName
partitionedData.mapPartitions {
trainingSamples =>
rabitEnv.put("DMLC_TASK_ID", TaskContext.getPartitionId().toString)
Rabit.init(rabitEnv.asJava)
val trainingSet = new DMatrix(new JDMatrix(trainingSamples, null))
val cacheFileName: String = {
if (useExternalMemory && trainingSamples.hasNext) {
s"$appName-dtrain_cache-${TaskContext.getPartitionId()}"
} else {
null
}
}
val trainingSet = new DMatrix(new JDMatrix(trainingSamples, cacheFileName))
val booster = SXGBoost.train(trainingSet, xgBoostConfMap, round,
watches = new mutable.HashMap[String, DMatrix]{put("train", trainingSet)}.toMap,
obj, eval)
@ -76,12 +87,15 @@ object XGBoost extends Serializable {
* workers equals the number of partitions of the trainingData RDD
* @param obj the user-defined objective function, null by default
* @param eval the user-defined evaluation function, null by default
* @param useExternalMemory indicates whether to use the external memory cache; by setting this flag
* to true, the user can reduce the RAM cost of running XGBoost within Spark
* @throws ml.dmlc.xgboost4j.java.XGBoostError when model training fails
* @return the trained XGBoostModel
*/
@throws(classOf[XGBoostError])
def train(trainingData: RDD[LabeledPoint], configMap: Map[String, Any], round: Int,
nWorkers: Int, obj: ObjectiveTrait = null, eval: EvalTrait = null): XGBoostModel = {
nWorkers: Int, obj: ObjectiveTrait = null, eval: EvalTrait = null,
useExternalMemory: Boolean = false): XGBoostModel = {
require(nWorkers > 0, "you must specify more than 0 workers")
val tracker = new RabitTracker(nWorkers)
implicit val sc = trainingData.sparkContext
@ -97,7 +111,7 @@ object XGBoost extends Serializable {
}
require(tracker.start(), "FAULT: Failed to start tracker")
val boosters = buildDistributedBoosters(trainingData, overridedConfMap,
tracker.getWorkerEnvs.asScala, nWorkers, round, obj, eval)
tracker.getWorkerEnvs.asScala, nWorkers, round, obj, eval, useExternalMemory)
val sparkJobThread = new Thread() {
override def run() {
// force the job

View File

@ -17,7 +17,7 @@
package ml.dmlc.xgboost4j.scala.spark
import org.apache.hadoop.fs.{Path, FileSystem}
import org.apache.spark.SparkContext
import org.apache.spark.{TaskContext, SparkContext}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
@ -27,13 +27,23 @@ class XGBoostModel(_booster: Booster)(implicit val sc: SparkContext) extends Ser
/**
* Predict result with the given testset (represented as RDD)
* @param testSet test set represented as RDD
* @param useExternalCache whether to use external cache for the test set
*/
def predict(testSet: RDD[Vector]): RDD[Array[Array[Float]]] = {
def predict(testSet: RDD[Vector], useExternalCache: Boolean = false): RDD[Array[Array[Float]]] = {
import DataUtils._
val broadcastBooster = testSet.sparkContext.broadcast(_booster)
val appName = testSet.context.appName
testSet.mapPartitions { testSamples =>
if (testSamples.hasNext) {
val dMatrix = new DMatrix(new JDMatrix(testSamples, null))
val cacheFileName = {
if (useExternalCache) {
s"$appName-dtest_cache-${TaskContext.getPartitionId()}"
} else {
null
}
}
val dMatrix = new DMatrix(new JDMatrix(testSamples, cacheFileName))
Iterator(broadcastBooster.value.predict(dMatrix))
} else {
Iterator()

View File

@ -127,7 +127,7 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap,
new scala.collection.mutable.HashMap[String, String],
numWorkers = 2, round = 5, null, null)
numWorkers = 2, round = 5, null, null, false)
val boosterCount = boosterRDD.count()
assert(boosterCount === 2)
val boosters = boosterRDD.collect()
@ -210,4 +210,26 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
println(xgBoostModel.predict(testRDD))
}
test("training with external memory cache") {
sc.stop()
sc = null
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
val customSparkContext = new SparkContext(sparkConf)
val eval = new EvalError()
val trainingRDD = buildTrainingRDD(Some(customSparkContext))
val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers, useExternalMemory = true)
assert(eval.eval(xgBoostModel.predict(testSetDMatrix), testSetDMatrix) < 0.1)
customSparkContext.stop()
// clean
val dir = new File(".")
for (file <- dir.listFiles() if file.getName.startsWith("XGBoostSuite-dtrain_cache")) {
file.delete()
}
}
}

View File

@ -2,8 +2,8 @@
ignore=tests
disable=unexpected-special-method-signature,too-many-nested-blocks
dummy-variables-rgx=(unused|)_.*
reports=no

View File

@ -0,0 +1,217 @@
# coding: utf-8
# pylint: disable= invalid-name
"""Training Library containing training routines."""
from __future__ import absolute_import
from . import rabit
from .core import EarlyStopException
def _fmt_metric(value, show_stdv=True):
"""format metric string"""
if len(value) == 2:
return '%s:%g' % (value[0], value[1])
elif len(value) == 3:
if show_stdv:
return '%s:%g+%g' % (value[0], value[1], value[2])
else:
return '%s:%g' % (value[0], value[1])
else:
raise ValueError("wrong metric value")
def print_evaluation(period=1, show_stdv=True):
"""Create a callback that print evaluation result.
Parameters
----------
period : int
The period to log the evaluation results
show_stdv : bool, optional
Whether to show stdv if provided.
Returns
-------
callback : function
A callback that prints the evaluation results every period iterations.
"""
def callback(env):
"""internal function"""
if env.rank != 0 or len(env.evaluation_result_list) == 0 or not period:
return
i = env.iteration
if i % period == 0 or i + 1 == env.end_iteration:
msg = '\t'.join([_fmt_metric(x, show_stdv) for x in env.evaluation_result_list])
rabit.tracker_print('[%d]\t%s\n' % (i, msg))
return callback
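
A hedged usage sketch of this callback, assuming `params`, `dtrain` and `dtest` are already defined elsewhere:

```python
import xgboost as xgb

# params, dtrain, dtest are assumed to exist; see the demos for full setup
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist,
                callbacks=[xgb.callback.print_evaluation(period=10)])
```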
def record_evaluation(eval_result):
"""Create a call back that records the evaluation history into eval_result.
Parameters
----------
eval_result : dict
A dictionary to store the evaluation results.
Returns
-------
callback : function
The requested callback function.
"""
if not isinstance(eval_result, dict):
raise TypeError('eval_result has to be a dictionary')
eval_result.clear()
def init(env):
"""internal function"""
for k, _ in env.evaluation_result_list:
key, metric = k.split('-')
if key not in eval_result:
eval_result[key] = {}
if metric not in eval_result[key]:
eval_result[key][metric] = []
def callback(env):
"""internal function"""
if len(eval_result) == 0:
init(env)
for k, v in env.evaluation_result_list:
key, metric = k.split('-')
eval_result[key][metric].append(v)
return callback
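
A usage sketch, again assuming `params`, `dtrain` and `watchlist` from the earlier sketch:

```python
history = {}  # filled as {data_name: {metric_name: [per-iteration values]}}
bst = xgb.train(params, dtrain, num_boost_round=10, evals=watchlist,
                callbacks=[xgb.callback.record_evaluation(history)])
print(history['train']['error'][-1])  # assumes 'error' is the active metric
```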
def reset_learning_rate(learning_rates):
"""Reset learning rate after iteration 1
NOTE: the initial learning rate will still take in-effect on first iteration.
Parameters
----------
learning_rates: list or function
List of learning rate for each boosting round
or a customized function that calculates eta in terms of the
current round number and the total number of boosting rounds
(e.g. to implement learning rate decay)
- list l: eta = l[boosting round]
- function f: eta = f(boosting round, num_boost_round)
Returns
-------
callback : function
The requested callback function.
"""
def callback(env):
"""internal function"""
bst = env.model
i = env.iteration
if isinstance(learning_rates, list):
if len(learning_rates) != env.end_iteration:
raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
bst.set_param('learning_rate', learning_rates[i])
else:
bst.set_param('learning_rate', learning_rates(i, env.end_iteration))
callback.before_iteration = True
return callback
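
A sketch of both accepted forms, with an illustrative decay schedule (assuming `params` and `dtrain` exist):

```python
# list form: one eta per boosting round
bst = xgb.train(params, dtrain, num_boost_round=4,
                callbacks=[xgb.callback.reset_learning_rate([0.3, 0.2, 0.1, 0.05])])

# function form: a simple exponential decay, purely illustrative
def eta_decay(current_round, num_boost_round):
    return 0.3 * (0.99 ** current_round)

bst = xgb.train(params, dtrain, num_boost_round=100,
                callbacks=[xgb.callback.reset_learning_rate(eta_decay)])
```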
def early_stop(stopping_rounds, maximize=False, verbose=True):
"""Create a callback that activates early stoppping.
Validation error needs to decrease at least
every <stopping_rounds> round(s) to continue training.
Requires at least one item in evals.
If there's more than one, the last one will be used.
Returns the model from the last iteration (not the best one).
If early stopping occurs, the model will have three additional fields:
bst.best_score, bst.best_iteration and bst.best_ntree_limit.
(Use bst.best_ntree_limit to get the correct value if num_parallel_tree
and/or num_class appears in the parameters)
Parameters
----------
stopping_rounds : int
    The number of rounds without improvement after which training stops.
maximize : bool
Whether to maximize evaluation metric.
verbose : bool, optional
Whether to print message about early stopping information.
Returns
-------
callback : function
The requested callback function.
"""
state = {}
def init(env):
"""internal function"""
bst = env.model
if len(env.evaluation_result_list) == 0:
raise ValueError('For early stopping you need at least one set in evals.')
if len(env.evaluation_result_list) > 1 and verbose:
msg = ("Multiple eval metrics have been passed: "
"'{0}' will be used for early stopping.\n\n")
rabit.tracker_print(msg.format(env.evaluation_result_list[-1][0]))
maximize_metrics = ('auc', 'map', 'ndcg')
maximize_score = maximize
metric = env.evaluation_result_list[-1][0]
if any(env.evaluation_result_list[-1][0].split('-')[1].startswith(x)
for x in maximize_metrics):
maximize_score = True
if verbose and env.rank == 0:
msg = "Will train until {} hasn't improved in {} rounds.\n"
rabit.tracker_print(msg.format(metric, stopping_rounds))
state['maximize_score'] = maximize_score
state['best_iteration'] = 0
if maximize_score:
state['best_score'] = float('-inf')
else:
state['best_score'] = float('inf')
if bst is not None:
if bst.attr('best_score') is not None:
state['best_score'] = float(bst.attr('best_score'))
state['best_iteration'] = int(bst.attr('best_iteration'))
state['best_msg'] = bst.attr('best_msg')
else:
bst.set_attr(best_iteration=str(state['best_iteration']))
bst.set_attr(best_score=str(state['best_score']))
else:
assert env.cvfolds is not None
def callback(env):
"""internal function"""
score = env.evaluation_result_list[-1][1]
if len(state) == 0:
init(env)
best_score = state['best_score']
best_iteration = state['best_iteration']
maximize_score = state['maximize_score']
if (maximize_score and score > best_score) or \
(not maximize_score and score < best_score):
msg = '[%d]\t%s' % (
env.iteration,
'\t'.join([_fmt_metric(x) for x in env.evaluation_result_list]))
state['best_msg'] = msg
state['best_score'] = score
state['best_iteration'] = env.iteration
# save the property to attributes, so they will occur in checkpoint.
if env.model is not None:
env.model.set_attr(best_score=str(state['best_score']),
best_iteration=str(state['best_iteration']),
best_msg=state['best_msg'])
elif env.iteration - best_iteration >= stopping_rounds:
best_msg = state['best_msg']
if verbose and env.rank == 0:
msg = "Stopping. Best iteration:\n{}\n\n"
rabit.tracker_print(msg.format(best_msg))
raise EarlyStopException(best_iteration)
return callback
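
Usage sketch; this is also what `train(..., early_stopping_rounds=10)` appends internally, so the two spellings should behave the same (assuming `params`, `dtrain` and `watchlist` exist):

```python
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=watchlist,
                callbacks=[xgb.callback.early_stop(stopping_rounds=10)])
# if stopping triggered, the model carries the extra fields described above
print(bst.best_iteration, bst.best_score, bst.best_ntree_limit)
```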

View File

@ -1,5 +1,5 @@
# coding: utf-8
# pylint: disable=unused-import, invalid-name, wrong-import-position
# pylint: disable= invalid-name, unused-import
"""For compatibility"""
from __future__ import absolute_import
@ -14,12 +14,14 @@ if PY3:
STRING_TYPES = str,
def py_str(x):
"""convert c string back to python string"""
return x.decode('utf-8')
else:
# pylint: disable=invalid-name
STRING_TYPES = basestring,
def py_str(x):
"""convert c string back to python string"""
return x
try:

View File

@ -1,5 +1,6 @@
# coding: utf-8
# pylint: disable=too-many-arguments, too-many-branches
# pylint: disable=too-many-arguments, too-many-branches, invalid-name
# pylint: disable=too-many-branches, too-many-lines, W0141
"""Core XGBoost Library."""
from __future__ import absolute_import
@ -22,6 +23,31 @@ class XGBoostError(Exception):
pass
class EarlyStopException(Exception):
"""Exception to signal early stopping.
Parameters
----------
best_iteration : int
The best iteration at which training stopped.
"""
def __init__(self, best_iteration):
super(EarlyStopException, self).__init__()
self.best_iteration = best_iteration
# Callback environment used by callbacks
CallbackEnv = collections.namedtuple(
"XGBoostCallbackEnv",
["model",
"cvfolds",
"iteration",
"begin_iteration",
"end_iteration",
"rank",
"evaluation_result_list"])
def from_pystr_to_cstr(data):
"""Convert a list of Python str to C pointer
@ -657,7 +683,7 @@ class Booster(object):
def __copy__(self):
return self.__deepcopy__(None)
def __deepcopy__(self, memo):
def __deepcopy__(self, _):
return Booster(model_file=self.save_raw())
def copy(self):
@ -975,7 +1001,6 @@ class Booster(object):
_check_call(_LIB.XGBoosterLoadModelFromBuffer(self.handle, ptr, length))
def dump_model(self, fout, fmap='', with_stats=False):
# pylint: disable=consider-using-enumerate
"""
Dump model into a text file.
@ -1143,10 +1168,12 @@ class Booster(object):
msg = 'feature_names mismatch: {0} {1}'
if dat_missing:
msg += '\nexpected ' + ', '.join(str(s) for s in dat_missing) + ' in input data'
msg += ('\nexpected ' + ', '.join(str(s) for s in dat_missing) +
' in input data')
if my_missing:
msg += '\ntraining data did not have the following fields: ' + ', '.join(str(s) for s in my_missing)
msg += ('\ntraining data did not have the following fields: ' +
', '.join(str(s) for s in my_missing))
raise ValueError(msg.format(self.feature_names,
data.feature_names))
@ -1161,23 +1188,25 @@ class Booster(object):
The name of feature map file.
bin: int, default None
The maximum number of bins.
Number of bins equals number of unique split values n_unique, if bins == None or bins > n_unique.
Number of bins equals number of unique split values n_unique,
if bins == None or bins > n_unique.
as_pandas : bool, default True
Return pd.DataFrame when pandas is installed.
If False or pandas is not installed, return numpy ndarray.
Returns
-------
a histogram of used splitting values for the specified feature either as numpy array or pandas DataFrame.
a histogram of used splitting values for the specified feature
either as numpy array or pandas DataFrame.
"""
xgdump = self.get_dump(fmap=fmap)
values = []
regexp = re.compile("\[{0}<([\d.Ee+-]+)\]".format(feature))
regexp = re.compile(r"\[{0}<([\d.Ee+-]+)\]".format(feature))
for i in range(len(xgdump)):
m = re.findall(regexp, xgdump[i])
values.extend(map(float, m))
n_unique = np.unique(values).shape[0]
n_unique = len(np.unique(values))
bins = max(min(n_unique, bins) if bins is not None else n_unique, 1)
nph = np.histogram(values, bins=bins)
@ -1187,7 +1216,8 @@ class Booster(object):
if as_pandas and PANDAS_INSTALLED:
return DataFrame(nph, columns=['SplitValue', 'Count'])
elif as_pandas and not PANDAS_INSTALLED:
sys.stderr.write("Returning histogram as ndarray (as_pandas == True, but pandas is not installed).")
sys.stderr.write(
"Returning histogram as ndarray (as_pandas == True, but pandas is not installed).")
return nph
else:
return nph

View File

@ -1,3 +1,6 @@
# coding: utf-8
# pylint: disable= invalid-name
"""Distributed XGBoost Rabit related API."""
from __future__ import absolute_import
import sys
@ -179,7 +182,7 @@ def allreduce(data, op, prepare_fun=None):
else:
func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def pfunc(args):
def pfunc(_):
"""prepare function."""
prepare_fun(data)
_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),

View File

@ -1,5 +1,5 @@
# coding: utf-8
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, E0012, R0912
"""Scikit-Learn Wrapper interface for XGBoost."""
from __future__ import absolute_import
@ -42,6 +42,7 @@ def _objective_decorator(func):
``dmatrix.get_label()``
"""
def inner(preds, dmatrix):
"""internal function"""
labels = dmatrix.get_label()
return func(labels, preds)
return inner
@ -79,9 +80,9 @@ class XGBModel(XGBModelBase):
colsample_bylevel : float
Subsample ratio of columns for each split, in each level.
reg_alpha : float (xgb's alpha)
L1 regularization term on weights
reg_lambda : float (xgb's lambda)
L2 regularization term on weights
scale_pos_weight : float
Balancing of positive and negative weights.
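
With the corrected docstring, a quick sketch of both penalties on toy data (values illustrative):

```python
import numpy as np
from xgboost import XGBRegressor

X = np.random.rand(100, 5)  # toy data
y = np.random.rand(100)
# reg_alpha maps to xgb's alpha (L1 penalty), reg_lambda to lambda (L2 penalty)
model = XGBRegressor(reg_alpha=0.1, reg_lambda=1.0).fit(X, y)
```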
@ -183,7 +184,7 @@ class XGBModel(XGBModelBase):
def fit(self, X, y, eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True):
# pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init, redefined-variable-type
# pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init
"""
Fit the gradient boosting model
@ -351,7 +352,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True):
# pylint: disable = attribute-defined-outside-init,arguments-differ, redefined-variable-type
# pylint: disable = attribute-defined-outside-init,arguments-differ
"""
Fit gradient boosting classifier
@ -440,6 +441,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
evals_result=evals_result, obj=obj, feval=feval,
verbose_eval=verbose)
self.objective = xgb_options["objective"]
if evals_result:
for val in evals_result.items():
evals_result_key = list(val[1].keys())[0]

View File

@ -1,20 +1,122 @@
# coding: utf-8
# pylint: disable=too-many-locals, too-many-arguments, invalid-name
# pylint: disable=too-many-branches
# pylint: disable=too-many-branches, too-many-statements
"""Training Library containing training routines."""
from __future__ import absolute_import
import sys
import re
import numpy as np
from .core import Booster, STRING_TYPES, XGBoostError
from .core import Booster, STRING_TYPES, XGBoostError, CallbackEnv, EarlyStopException
from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold)
from . import rabit
from . import callback
def _train_internal(params, dtrain,
num_boost_round=10, evals=(),
obj=None, feval=None,
xgb_model=None, callbacks=None):
"""internal training function"""
callbacks = [] if callbacks is None else callbacks
evals = list(evals)
if isinstance(params, dict) \
and 'eval_metric' in params \
and isinstance(params['eval_metric'], list):
params = dict((k, v) for k, v in params.items())
eval_metrics = params['eval_metric']
params.pop("eval_metric", None)
params = list(params.items())
for eval_metric in eval_metrics:
params += [('eval_metric', eval_metric)]
bst = Booster(params, [dtrain] + [d[0] for d in evals])
nboost = 0
num_parallel_tree = 1
if xgb_model is not None:
if not isinstance(xgb_model, STRING_TYPES):
xgb_model = xgb_model.save_raw()
bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
nboost = len(bst.get_dump())
else:
bst = Booster(params, [dtrain] + [d[0] for d in evals])
_params = dict(params) if isinstance(params, list) else params
if 'num_parallel_tree' in _params:
num_parallel_tree = _params['num_parallel_tree']
nboost //= num_parallel_tree
if 'num_class' in _params:
nboost //= _params['num_class']
# Distributed code: Load the checkpoint from rabit.
version = bst.load_rabit_checkpoint()
assert(rabit.get_world_size() != 1 or version == 0)
rank = rabit.get_rank()
start_iteration = int(version / 2)
nboost += start_iteration
callbacks_before_iter = [
cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
callbacks_after_iter = [
cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
for i in range(start_iteration, num_boost_round):
for cb in callbacks_before_iter:
cb(CallbackEnv(model=bst,
cvfolds=None,
iteration=i,
begin_iteration=start_iteration,
end_iteration=num_boost_round,
rank=rank,
evaluation_result_list=None))
# Distributed code: need to resume to this point.
# Skip the first update if it is a recovery step.
if version % 2 == 0:
bst.update(dtrain, i, obj)
bst.save_rabit_checkpoint()
version += 1
assert(rabit.get_world_size() == 1 or version == rabit.version_number())
nboost += 1
evaluation_result_list = []
# check evaluation result.
if len(evals) != 0:
bst_eval_set = bst.eval_set(evals, i, feval)
if isinstance(bst_eval_set, STRING_TYPES):
msg = bst_eval_set
else:
msg = bst_eval_set.decode()
res = [x.split(':') for x in msg.split()]
evaluation_result_list = [(k, float(v)) for k, v in res[1:]]
try:
for cb in callbacks_after_iter:
cb(CallbackEnv(model=bst,
cvfolds=None,
iteration=i,
begin_iteration=start_iteration,
end_iteration=num_boost_round,
rank=rank,
evaluation_result_list=evaluation_result_list))
except EarlyStopException:
break
# do checkpoint after evaluation, in case evaluation also updates booster.
bst.save_rabit_checkpoint()
version += 1
if bst.attr('best_score') is not None:
bst.best_score = float(bst.attr('best_score'))
bst.best_iteration = int(bst.attr('best_iteration'))
else:
bst.best_iteration = nboost - 1
bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
return bst
def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
maximize=False, early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, xgb_model=None):
verbose_eval=True, learning_rates=None, xgb_model=None, callbacks=None):
# pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
"""Train a booster with given parameters.
@ -70,176 +172,37 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
xgb_model : file name of stored xgb model or 'Booster' instance
Xgb model to be loaded before training (allows training continuation).
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
Returns
-------
booster : a trained booster model
"""
evals = list(evals)
if isinstance(params, dict) \
and 'eval_metric' in params \
and isinstance(params['eval_metric'], list):
params = dict((k, v) for k, v in params.items())
eval_metrics = params['eval_metric']
params.pop("eval_metric", None)
params = list(params.items())
for eval_metric in eval_metrics:
params += [('eval_metric', eval_metric)]
callbacks = [] if callbacks is None else callbacks
bst = Booster(params, [dtrain] + [d[0] for d in evals])
nboost = 0
num_parallel_tree = 1
if isinstance(verbose_eval, bool):
verbose_eval_every_line = False
# Most of the legacy advanced options become callbacks
if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation())
else:
if isinstance(verbose_eval, int):
verbose_eval_every_line = verbose_eval
verbose_eval = True if verbose_eval_every_line > 0 else False
callbacks.append(callback.print_evaluation(verbose_eval))
if rabit.get_rank() != 0:
verbose_eval = False
if xgb_model is not None:
if not isinstance(xgb_model, STRING_TYPES):
xgb_model = xgb_model.save_raw()
bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
nboost = len(bst.get_dump())
else:
bst = Booster(params, [dtrain] + [d[0] for d in evals])
_params = dict(params) if isinstance(params, list) else params
_eta_param_name = 'eta' if 'eta' in _params else 'learning_rate'
if 'num_parallel_tree' in _params:
num_parallel_tree = _params['num_parallel_tree']
nboost //= num_parallel_tree
if 'num_class' in _params:
nboost //= _params['num_class']
if early_stopping_rounds is not None:
callbacks.append(callback.early_stop(early_stopping_rounds,
maximize=maximize,
verbose=bool(verbose_eval)))
if learning_rates is not None:
callbacks.append(callback.reset_learning_rate(learning_rates))
if evals_result is not None:
if not isinstance(evals_result, dict):
raise TypeError('evals_result has to be a dictionary')
else:
evals_name = [d[1] for d in evals]
evals_result.clear()
evals_result.update(dict([(key, {}) for key in evals_name]))
callbacks.append(callback.record_evaluation(evals_result))
# early stopping
if early_stopping_rounds is not None:
if len(evals) < 1:
raise ValueError('For early stopping you need at least one set in evals.')
if verbose_eval:
rabit.tracker_print("Will train until {} error hasn't decreased in {} rounds.\n".format(
evals[-1][1], early_stopping_rounds))
# is params a list of tuples? are we using multiple eval metrics?
if isinstance(params, list):
if len(params) != len(dict(params).items()):
params = dict(params)
msg = ("Multiple eval metrics have been passed: "
"'{0}' will be used for early stopping.\n\n")
rabit.tracker_print(msg.format(params['eval_metric']))
else:
params = dict(params)
# either minimize loss or maximize AUC/MAP/NDCG
maximize_score = False
if 'eval_metric' in params:
maximize_metrics = ('auc', 'map', 'ndcg')
if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
maximize_score = True
if feval is not None:
maximize_score = maximize
if maximize_score:
bst.set_attr(best_score='0.0')
else:
bst.set_attr(best_score='inf')
bst.set_attr(best_iteration='0')
if isinstance(learning_rates, list) and len(learning_rates) != num_boost_round:
raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
# Distributed code: Load the checkpoint from rabit.
version = bst.load_rabit_checkpoint()
assert(rabit.get_world_size() != 1 or version == 0)
start_iteration = int(version / 2)
nboost += start_iteration
for i in range(start_iteration, num_boost_round):
if learning_rates is not None:
if isinstance(learning_rates, list):
bst.set_param(_eta_param_name, learning_rates[i])
else:
bst.set_param(_eta_param_name, learning_rates(i, num_boost_round))
# Distributed code: need to resume to this point.
# Skip the first update if it is a recovery step.
if version % 2 == 0:
bst.update(dtrain, i, obj)
bst.save_rabit_checkpoint()
version += 1
assert(rabit.get_world_size() == 1 or version == rabit.version_number())
nboost += 1
# check evaluation result.
if len(evals) != 0:
bst_eval_set = bst.eval_set(evals, i, feval)
if isinstance(bst_eval_set, STRING_TYPES):
msg = bst_eval_set
else:
msg = bst_eval_set.decode()
if verbose_eval:
if verbose_eval_every_line:
if i % verbose_eval_every_line == 0 or i == num_boost_round - 1:
rabit.tracker_print(msg + '\n')
else:
rabit.tracker_print(msg + '\n')
if evals_result is not None:
res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg)
for key in evals_name:
evals_idx = evals_name.index(key)
res_per_eval = len(res) // len(evals_name)
for r in range(res_per_eval):
res_item = res[(evals_idx * res_per_eval) + r]
res_key = res_item[0]
res_val = res_item[1]
if res_key in evals_result[key]:
evals_result[key][res_key].append(res_val)
else:
evals_result[key][res_key] = [res_val]
if early_stopping_rounds:
score = float(msg.rsplit(':', 1)[1])
best_score = float(bst.attr('best_score'))
best_iteration = int(bst.attr('best_iteration'))
if (maximize_score and score > best_score) or \
(not maximize_score and score < best_score):
# save the property to attributes, so they will occur in checkpoint.
bst.set_attr(best_score=str(score),
best_iteration=str(nboost - 1),
best_msg=msg)
elif i - best_iteration >= early_stopping_rounds:
best_msg = bst.attr('best_msg')
if verbose_eval:
msg = "Stopping. Best iteration:\n{}\n\n"
rabit.tracker_print(msg.format(best_msg))
break
# do checkpoint after evaluation, in case evaluation also updates booster.
bst.save_rabit_checkpoint()
version += 1
if early_stopping_rounds:
bst.best_score = float(bst.attr('best_score'))
bst.best_iteration = int(bst.attr('best_iteration'))
else:
bst.best_iteration = nboost - 1
bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
return bst
return _train_internal(params, dtrain,
num_boost_round=num_boost_round,
evals=evals,
obj=obj, feval=feval,
xgb_model=xgb_model, callbacks=callbacks)
class CVPack(object):
@ -294,7 +257,7 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
return ret
def aggcv(rlist, show_stdv=True, verbose_eval=None, as_pandas=True, trial=0):
def aggcv(rlist):
# pylint: disable=invalid-name
"""
Aggregate cross-validation results.
@ -315,50 +278,21 @@ def aggcv(rlist, show_stdv=True, verbose_eval=None, as_pandas=True, trial=0):
if k not in cvmap:
cvmap[k] = []
cvmap[k].append(float(v))
msg = idx
if show_stdv:
fmt = '\tcv-{0}:{1}+{2}'
else:
fmt = '\tcv-{0}:{1}'
index = []
results = []
for k, v in sorted(cvmap.items(), key=lambda x: x[0]):
for k, v in sorted(cvmap.items(), key=lambda x: (x[0].startswith('test'), x[0])):
v = np.array(v)
if not isinstance(msg, STRING_TYPES):
msg = msg.decode()
mean, std = np.mean(v), np.std(v)
msg += fmt.format(k, mean, std)
index.extend([k + '-mean', k + '-std'])
results.extend([mean, std])
if as_pandas:
try:
import pandas as pd
results = pd.Series(results, index=index)
except ImportError:
if verbose_eval is None:
verbose_eval = True
else:
# if verbose_eval is default (None),
# result will be np.ndarray as it can't hold column name
if verbose_eval is None:
verbose_eval = True
if (isinstance(verbose_eval, int) and verbose_eval > 0 and trial % verbose_eval == 0) or \
(isinstance(verbose_eval, bool) and verbose_eval):
sys.stderr.write(msg + '\n')
sys.stderr.flush()
results.extend([(k, mean, std)])
return results
def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None,
metrics=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,
fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True, seed=0):
fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None):
# pylint: disable = invalid-name
"""Cross-validation with given paramaters.
@ -404,6 +338,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
Results are not affected, and always contains std.
seed : int
Seed used to generate the folds (passed to numpy.random.seed).
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
Returns
-------
@ -431,59 +367,63 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
params.pop("eval_metric", None)
if early_stopping_rounds is not None:
if len(metrics) > 1:
msg = ('Check your params. '
'Early stopping works with single eval metric only.')
raise ValueError(msg)
if verbose_eval:
msg = "Will train until cv error hasn't decreased in {} rounds.\n"
sys.stderr.write(msg.format(early_stopping_rounds))
maximize_score = False
if len(metrics) == 1:
maximize_metrics = ('auc', 'map', 'ndcg')
if any(metrics[0].startswith(x) for x in maximize_metrics):
maximize_score = True
if feval is not None:
maximize_score = maximize
if maximize_score:
best_score = 0.0
else:
best_score = float('inf')
best_score_i = 0
results = []
results = {}
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc, stratified, folds)
# setup callbacks
callbacks = [] if callbacks is None else callbacks
if early_stopping_rounds is not None:
callbacks.append(callback.early_stop(early_stopping_rounds,
maximize=maximize,
verbose=False))
if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
else:
if isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
callbacks_before_iter = [
cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
callbacks_after_iter = [
cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
for i in range(num_boost_round):
for cb in callbacks_before_iter:
cb(CallbackEnv(model=None,
cvfolds=cvfolds,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
rank=0,
evaluation_result_list=None))
for fold in cvfolds:
fold.update(i, obj)
res = aggcv([f.eval(i, feval) for f in cvfolds],
show_stdv=show_stdv, verbose_eval=verbose_eval,
as_pandas=as_pandas, trial=i)
results.append(res)
res = aggcv([f.eval(i, feval) for f in cvfolds])
if early_stopping_rounds is not None:
score = res[0]
if (maximize_score and score > best_score) or \
(not maximize_score and score < best_score):
best_score = score
best_score_i = i
elif i - best_score_i >= early_stopping_rounds:
results = results[:best_score_i + 1]
if verbose_eval:
msg = "Stopping. Best iteration:\n[{}] cv-mean:{}\tcv-std:{}\n"
sys.stderr.write(msg.format(best_score_i, results[-1][0], results[-1][1]))
break
for key, mean, std in res:
if key + '-mean' not in results:
results[key + '-mean'] = []
if key + '-std' not in results:
results[key + '-std'] = []
results[key + '-mean'].append(mean)
results[key + '-std'].append(std)
try:
for cb in callbacks_after_iter:
cb(CallbackEnv(model=None,
cvfolds=cvfolds,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
rank=0,
evaluation_result_list=res))
except EarlyStopException as e:
for k in results.keys():
results[k] = results[k][:(e.best_iteration + 1)]
break
if as_pandas:
try:
import pandas as pd
results = pd.DataFrame(results)
results = pd.DataFrame.from_dict(results)
except ImportError:
results = np.array(results)
else:
results = np.array(results)
pass
return results
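
With `as_pandas=False`, the result of `cv` is now a dict of per-iteration lists keyed by the `<name>-mean` / `<name>-std` pattern built above; a sketch assuming `params` and `dtrain` exist and `error` is the active metric:

```python
res = xgb.cv(params, dtrain, num_boost_round=10, nfold=3, as_pandas=False)
print(res['test-error-mean'][-1])  # mean test error of the last round
```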

2
rabit

@ -1 +1 @@
Subproject commit e19fced5cbd4e41b10099facae7caa5cd3e6ada3
Subproject commit 8f61535b83e650331459d7f33a1615fa7d27b7bd

View File

@ -271,6 +271,7 @@ void CLIDump2Text(const CLIParam& param) {
std::unique_ptr<Learner> learner(Learner::Create({}));
std::unique_ptr<dmlc::Stream> fi(
dmlc::Stream::Create(param.model_in.c_str(), "r"));
learner->Configure(param.cfg);
learner->Load(fi.get());
// dump data
std::vector<std::string> dump = learner->Dump2Text(fmap, param.dump_stats);
@ -297,6 +298,7 @@ void CLIPredict(const CLIParam& param) {
std::unique_ptr<Learner> learner(Learner::Create({}));
std::unique_ptr<dmlc::Stream> fi(
dmlc::Stream::Create(param.model_in.c_str(), "r"));
learner->Configure(param.cfg);
learner->Load(fi.get());
if (param.silent == 0) {

View File

@ -223,6 +223,10 @@ DMatrix* DMatrix::Load(const std::string& uri,
LOG(CONSOLE) << info.base_margin.size()
<< " base_margin are loaded from " << fname << ".base_margin";
}
if (MetaTryLoadFloatInfo(fname + ".weight", &info.weights) && !silent) {
LOG(CONSOLE) << info.weights.size()
<< " weights are loaded from " << fname << ".weight";
}
}
return dmat;
}

View File

@ -256,41 +256,44 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
name_shards.push_back(prefix + ".col.page");
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).second);
}
SparsePage::Writer writer(name_shards, format_shards, 6);
std::unique_ptr<SparsePage> page;
writer.Alloc(&page); page->Clear();
double tstart = dmlc::GetTime();
size_t bytes_write = 0;
// print every 4 sec.
const double kStep = 4.0;
size_t tick_expected = kStep;
{
SparsePage::Writer writer(name_shards, format_shards, 6);
std::unique_ptr<SparsePage> page;
writer.Alloc(&page); page->Clear();
while (make_next_col(page.get())) {
for (size_t i = 0; i < page->Size(); ++i) {
col_size_[i] += page->offset[i + 1] - page->offset[i];
}
bytes_write += page->MemCostBytes();
writer.PushWrite(std::move(page));
writer.Alloc(&page);
page->Clear();
double tdiff = dmlc::GetTime() - tstart;
if (tdiff >= tick_expected) {
LOG(CONSOLE) << "Writing col.page file to " << cache_info_
<< " in " << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " MB writen";
tick_expected += kStep;
double tstart = dmlc::GetTime();
size_t bytes_write = 0;
// print every 4 sec.
const double kStep = 4.0;
size_t tick_expected = kStep;
while (make_next_col(page.get())) {
for (size_t i = 0; i < page->Size(); ++i) {
col_size_[i] += page->offset[i + 1] - page->offset[i];
}
bytes_write += page->MemCostBytes();
writer.PushWrite(std::move(page));
writer.Alloc(&page);
page->Clear();
double tdiff = dmlc::GetTime() - tstart;
if (tdiff >= tick_expected) {
LOG(CONSOLE) << "Writing col.page file to " << cache_info_
<< " in " << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " MB writen";
tick_expected += kStep;
}
}
// save meta data
std::string col_meta_name = cache_shards[0] + ".col.meta";
std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(col_meta_name.c_str(), "w"));
fo->Write(buffered_rowset_);
fo->Write(col_size_);
fo.reset(nullptr);
}
// save meta data
std::string col_meta_name = cache_shards[0] + ".col.meta";
std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(col_meta_name.c_str(), "w"));
fo->Write(buffered_rowset_);
fo->Write(col_size_);
fo.reset(nullptr);
// initialize column data
CHECK(TryInitColData());
}

View File

@ -110,58 +110,60 @@ void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
name_shards.push_back(prefix + ".row.page");
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).first);
}
SparsePage::Writer writer(name_shards, format_shards, 6);
std::unique_ptr<SparsePage> page;
writer.Alloc(&page); page->Clear();
{
SparsePage::Writer writer(name_shards, format_shards, 6);
std::unique_ptr<SparsePage> page;
writer.Alloc(&page); page->Clear();
MetaInfo info;
size_t bytes_write = 0;
double tstart = dmlc::GetTime();
// print every 4 sec.
const double kStep = 4.0;
size_t tick_expected = kStep;
MetaInfo info;
size_t bytes_write = 0;
double tstart = dmlc::GetTime();
// print every 4 sec.
const double kStep = 4.0;
size_t tick_expected = kStep;
while (src->Next()) {
const dmlc::RowBlock<uint32_t>& batch = src->Value();
if (batch.label != nullptr) {
info.labels.insert(info.labels.end(), batch.label, batch.label + batch.size);
}
if (batch.weight != nullptr) {
info.weights.insert(info.weights.end(), batch.weight, batch.weight + batch.size);
}
info.num_row += batch.size;
info.num_nonzero += batch.offset[batch.size] - batch.offset[0];
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
uint32_t index = batch.index[i];
info.num_col = std::max(info.num_col,
static_cast<uint64_t>(index + 1));
}
page->Push(batch);
if (page->MemCostBytes() >= kPageSize) {
bytes_write += page->MemCostBytes();
writer.PushWrite(std::move(page));
writer.Alloc(&page);
page->Clear();
while (src->Next()) {
const dmlc::RowBlock<uint32_t>& batch = src->Value();
if (batch.label != nullptr) {
info.labels.insert(info.labels.end(), batch.label, batch.label + batch.size);
}
if (batch.weight != nullptr) {
info.weights.insert(info.weights.end(), batch.weight, batch.weight + batch.size);
}
info.num_row += batch.size;
info.num_nonzero += batch.offset[batch.size] - batch.offset[0];
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
uint32_t index = batch.index[i];
info.num_col = std::max(info.num_col,
static_cast<uint64_t>(index + 1));
}
page->Push(batch);
if (page->MemCostBytes() >= kPageSize) {
bytes_write += page->MemCostBytes();
writer.PushWrite(std::move(page));
writer.Alloc(&page);
page->Clear();
double tdiff = dmlc::GetTime() - tstart;
if (tdiff >= tick_expected) {
LOG(CONSOLE) << "Writing row.page to " << cache_info << " in "
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " written";
tick_expected += kStep;
double tdiff = dmlc::GetTime() - tstart;
if (tdiff >= tick_expected) {
LOG(CONSOLE) << "Writing row.page to " << cache_info << " in "
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " written";
tick_expected += kStep;
}
}
}
}
if (page->data.size() != 0) {
writer.PushWrite(std::move(page));
}
if (page->data.size() != 0) {
writer.PushWrite(std::move(page));
}
std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(name_info.c_str(), "w"));
int tmagic = kMagic;
fo->Write(&tmagic, sizeof(tmagic));
info.SaveBinary(fo.get());
std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(name_info.c_str(), "w"));
int tmagic = kMagic;
fo->Write(&tmagic, sizeof(tmagic));
info.SaveBinary(fo.get());
}
LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info;
}
@ -176,38 +178,39 @@ void SparsePageSource::Create(DMatrix* src,
name_shards.push_back(prefix + ".row.page");
format_shards.push_back(SparsePage::Format::DecideFormat(prefix).first);
}
SparsePage::Writer writer(name_shards, format_shards, 6);
std::unique_ptr<SparsePage> page;
writer.Alloc(&page); page->Clear();
{
SparsePage::Writer writer(name_shards, format_shards, 6);
std::unique_ptr<SparsePage> page;
writer.Alloc(&page); page->Clear();
MetaInfo info;
size_t bytes_write = 0;
double tstart = dmlc::GetTime();
dmlc::DataIter<RowBatch>* iter = src->RowIterator();
MetaInfo info;
size_t bytes_write = 0;
double tstart = dmlc::GetTime();
dmlc::DataIter<RowBatch>* iter = src->RowIterator();
while (iter->Next()) {
page->Push(iter->Value());
if (page->MemCostBytes() >= kPageSize) {
bytes_write += page->MemCostBytes();
writer.PushWrite(std::move(page));
writer.Alloc(&page);
page->Clear();
double tdiff = dmlc::GetTime() - tstart;
LOG(CONSOLE) << "Writing to " << cache_info << " in "
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " written";
while (iter->Next()) {
page->Push(iter->Value());
if (page->MemCostBytes() >= kPageSize) {
bytes_write += page->MemCostBytes();
writer.PushWrite(std::move(page));
writer.Alloc(&page);
page->Clear();
double tdiff = dmlc::GetTime() - tstart;
LOG(CONSOLE) << "Writing to " << cache_info << " in "
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " written";
}
}
if (page->data.size() != 0) {
writer.PushWrite(std::move(page));
}
}
if (page->data.size() != 0) {
writer.PushWrite(std::move(page));
std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(name_info.c_str(), "w"));
int tmagic = kMagic;
fo->Write(&tmagic, sizeof(tmagic));
info.SaveBinary(fo.get());
}
std::unique_ptr<dmlc::Stream> fo(
dmlc::Stream::Create(name_info.c_str(), "w"));
int tmagic = kMagic;
fo->Write(&tmagic, sizeof(tmagic));
info.SaveBinary(fo.get());
LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info;
}

View File

@ -34,6 +34,7 @@ SparsePage::Writer::Writer(
fo->Write(format_shard);
std::unique_ptr<SparsePage> page;
while (wqueue->Pop(&page)) {
if (page.get() == nullptr) break;
fmt->Write(*page, fo.get());
qrecycle_.Push(std::move(page));
}
@ -45,7 +46,9 @@ SparsePage::Writer::Writer(
SparsePage::Writer::~Writer() {
for (auto& queue : qworkers_) {
queue.SignalForKill();
// use nullptr to signal termination.
std::unique_ptr<SparsePage> sig(nullptr);
queue.Push(std::move(sig));
}
for (auto& thread : workers_) {
thread->join();
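
The same shutdown idiom, sketched in Python for illustration only (not the project's code): a `None` sentinel lets the worker drain in-flight pages before exiting, instead of killing the queue outright.

```python
import queue
import threading

q = queue.Queue()

def writer_loop():
    while True:
        page = q.get()
        if page is None:  # sentinel: exit cleanly after draining earlier items
            break
        # ... write `page` to disk ...

t = threading.Thread(target=writer_loop)
t.start()
q.put(b'page-bytes')
q.put(None)  # request shutdown
t.join()
```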

View File

@ -17,6 +17,8 @@
#include <limits>
#include "../common/common.h"
#include "../common/random.h"
namespace xgboost {
namespace gbm {
@ -47,6 +49,42 @@ struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
}
};
/*! \brief training parameters */
struct DartTrainParam : public dmlc::Parameter<DartTrainParam> {
/*! \brief whether to not print info during training */
bool silent;
/*! \brief type of sampling algorithm */
int sample_type;
/*! \brief type of normalization algorithm */
int normalize_type;
/*! \brief fraction of trees to drop during dropout */
float rate_drop;
/*! \brief probability of skipping dropout */
float skip_drop;
/*! \brief learning step size for a time */
float learning_rate;
// declare parameters
DMLC_DECLARE_PARAMETER(DartTrainParam) {
DMLC_DECLARE_FIELD(silent).set_default(false)
.describe("Not print information during trainig.");
DMLC_DECLARE_FIELD(sample_type).set_default(0)
.add_enum("uniform", 0)
.add_enum("weighted", 1)
.describe("Different types of sampling algorithm.");
DMLC_DECLARE_FIELD(normalize_type).set_default(0)
.add_enum("tree", 0)
.add_enum("forest", 1)
.describe("Different types of normalization algorithm.");
DMLC_DECLARE_FIELD(rate_drop).set_range(0.0f, 1.0f).set_default(0.0f)
.describe("Parameter of how many trees are dropped.");
DMLC_DECLARE_FIELD(skip_drop).set_range(0.0f, 1.0f).set_default(0.0f)
.describe("Parameter of whether to drop trees.");
DMLC_DECLARE_FIELD(learning_rate).set_lower_bound(0.0f).set_default(0.3f)
.describe("Learning rate(step size) of update.");
DMLC_DECLARE_ALIAS(learning_rate, eta);
}
};
/*! \brief model parameters */
struct GBTreeModelParam : public dmlc::Parameter<GBTreeModelParam> {
/*! \brief number of trees */
@ -313,8 +351,9 @@ class GBTree : public GradientBooster {
}
}
// commit new trees all at once
inline void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) {
virtual void
CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) {
for (size_t i = 0; i < new_trees.size(); ++i) {
trees.push_back(std::move(new_trees[i]));
tree_info.push_back(bst_group);
@ -475,14 +514,236 @@ class GBTree : public GradientBooster {
std::vector<std::unique_ptr<TreeUpdater> > updaters;
};
// dart
class Dart : public GBTree {
public:
Dart() {}
void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
GBTree::Configure(cfg);
if (trees.size() == 0) {
dparam.InitAllowUnknown(cfg);
}
}
void Load(dmlc::Stream* fi) override {
GBTree::Load(fi);
weight_drop.resize(mparam.num_trees);
if (mparam.num_trees != 0) {
fi->Read(&weight_drop);
}
}
void Save(dmlc::Stream* fo) const override {
GBTree::Save(fo);
if (weight_drop.size() != 0) {
fo->Write(weight_drop);
}
}
// predict the leaf scores with dropout if ntree_limit = 0
void Predict(DMatrix* p_fmat,
int64_t buffer_offset,
std::vector<float>* out_preds,
unsigned ntree_limit) override {
DropTrees(ntree_limit);
const MetaInfo& info = p_fmat->info();
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
InitThreadTemp(nthread);
std::vector<float> &preds = *out_preds;
const size_t stride = p_fmat->info().num_row * mparam.num_output_group;
preds.resize(stride * (mparam.size_leaf_vector+1));
// start collecting the prediction
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
const RowBatch &batch = iter->Value();
// parallel over local batch
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) {
const int tid = omp_get_thread_num();
RegTree::FVec &feats = thread_temp[tid];
int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
CHECK_LT(static_cast<size_t>(ridx), info.num_row);
// loop over output groups
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
this->Pred(batch[i],
buffer_offset < 0 ? -1 : buffer_offset + ridx,
gid, info.GetRoot(ridx), &feats,
&preds[ridx * mparam.num_output_group + gid], stride,
ntree_limit);
}
}
}
}
void Predict(const SparseBatch::Inst& inst,
std::vector<float>* out_preds,
unsigned ntree_limit,
unsigned root_index) override {
DropTrees(1);
if (thread_temp.size() == 0) {
thread_temp.resize(1, RegTree::FVec());
thread_temp[0].Init(mparam.num_feature);
}
out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
// loop over output groups
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
this->Pred(inst, -1, gid, root_index, &thread_temp[0],
&(*out_preds)[gid], mparam.num_output_group,
ntree_limit);
}
}
protected:
// commit new trees all at once
virtual void
CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) {
for (size_t i = 0; i < new_trees.size(); ++i) {
trees.push_back(std::move(new_trees[i]));
tree_info.push_back(bst_group);
}
mparam.num_trees += static_cast<int>(new_trees.size());
size_t num_drop = NormalizeTrees(new_trees.size());
if (dparam.silent != 1) {
LOG(INFO) << "drop " << num_drop << " trees, "
<< "weight = " << weight_drop.back();
}
}
// predict the leaf scores without dropped trees
inline void Pred(const RowBatch::Inst &inst,
int64_t buffer_index,
int bst_group,
unsigned root_index,
RegTree::FVec *p_feats,
float *out_pred,
size_t stride,
unsigned ntree_limit) {
float psum = 0.0f;
// sum of leaf vector
std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
const int64_t bid = this->BufferOffset(buffer_index, bst_group);
p_feats->Fill(inst);
for (size_t i = 0; i < trees.size(); ++i) {
if (tree_info[i] == bst_group) {
bool drop = (std::find(idx_drop.begin(), idx_drop.end(), i) != idx_drop.end());
if (!drop) {
int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
psum += weight_drop[i] * (*trees[i])[tid].leaf_value();
for (int j = 0; j < mparam.size_leaf_vector; ++j) {
vec_psum[j] += weight_drop[i] * trees[i]->leafvec(tid)[j];
}
}
}
}
p_feats->Drop(inst);
// update the buffered results
if (bid >= 0 && ntree_limit == 0) {
pred_counter[bid] = static_cast<unsigned>(trees.size());
pred_buffer[bid] = psum;
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
pred_buffer[bid + i + 1] = vec_psum[i];
}
}
out_pred[0] = psum;
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
out_pred[stride * (i + 1)] = vec_psum[i];
}
}
// select dropped trees
inline void DropTrees(unsigned ntree_limit_drop) {
std::uniform_real_distribution<> runif(0.0, 1.0);
auto& rnd = common::GlobalRandom();
// reset
idx_drop.clear();
// sample dropped trees
bool skip = false;
if (dparam.skip_drop > 0.0) skip = (runif(rnd) < dparam.skip_drop);
if (ntree_limit_drop == 0 && !skip) {
if (dparam.sample_type == 1) {
float sum_weight = 0.0;
for (size_t i = 0; i < weight_drop.size(); ++i) {
sum_weight += weight_drop[i];
}
for (size_t i = 0; i < weight_drop.size(); ++i) {
if (runif(rnd) < dparam.rate_drop * weight_drop.size() * weight_drop[i] / sum_weight) {
idx_drop.push_back(i);
}
}
} else {
for (size_t i = 0; i < weight_drop.size(); ++i) {
if (runif(rnd) < dparam.rate_drop) {
idx_drop.push_back(i);
}
}
}
}
}
// set normalization factors
inline size_t NormalizeTrees(size_t size_new_trees) {
float lr = 1.0 * dparam.learning_rate / size_new_trees;
size_t num_drop = idx_drop.size();
if (num_drop == 0) {
for (size_t i = 0; i < size_new_trees; ++i) {
weight_drop.push_back(1.0);
}
} else {
if (dparam.normalize_type == 1) {
// normalize_type 1
float factor = 1.0 / (1.0 + lr);
for (size_t i = 0; i < idx_drop.size(); ++i) {
weight_drop[i] *= factor;
}
for (size_t i = 0; i < size_new_trees; ++i) {
weight_drop.push_back(lr * factor);
}
} else {
// normalize_type 0
float factor = 1.0 * num_drop / (num_drop + lr);
for (size_t i = 0; i < idx_drop.size(); ++i) {
weight_drop[i] *= factor;
}
for (size_t i = 0; i < size_new_trees; ++i) {
weight_drop.push_back(1.0 * lr / (num_drop + lr));
}
}
}
// reset
idx_drop.clear();
return num_drop;
}
// --- data structure ---
// training parameter
DartTrainParam dparam;
/*! \brief per-tree weights used by dropout */
std::vector<float> weight_drop;
// indexes of dropped trees
std::vector<size_t> idx_drop;
};
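
To make `NormalizeTrees` concrete, a toy check of the "tree" branch (`normalize_type == 0`) with illustrative numbers:

```python
# k dropped trees, one new tree, learning rate eta
k, eta = 3, 0.3
scale_dropped = k / (k + eta)  # each dropped tree's weight is multiplied by this
weight_new = eta / (k + eta)   # weight assigned to the new tree
print(round(scale_dropped, 3), round(weight_new, 3))  # 0.909 0.091
```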
// register the gradient boosters
DMLC_REGISTER_PARAMETER(GBTreeModelParam);
DMLC_REGISTER_PARAMETER(GBTreeTrainParam);
DMLC_REGISTER_PARAMETER(DartTrainParam);
XGBOOST_REGISTER_GBM(GBTree, "gbtree")
.describe("Tree booster, gradient boosted trees.")
.set_body([]() {
return new GBTree();
});
XGBOOST_REGISTER_GBM(Dart, "dart")
.describe("Tree booster, dart.")
.set_body([]() {
return new Dart();
});
} // namespace gbm
} // namespace xgboost

View File

@ -35,6 +35,22 @@ class TestBasic(unittest.TestCase):
# assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0
def test_record_results(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
# specify validations set to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
result = {}
res2 = {}
xgb.train(param, dtrain, num_round, watchlist,
callbacks=[xgb.callback.record_evaluation(result)])
xgb.train(param, dtrain, num_round, watchlist,
evals_result=res2)
assert result['train']['error'][0] < 0.1
assert res2 == result
def test_multiclass(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
@ -189,5 +205,5 @@ class TestBasic(unittest.TestCase):
# return np.ndarray
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False)
assert isinstance(cv, np.ndarray)
assert cv.shape == (10, 4)
assert isinstance(cv, dict)
assert len(cv) == 4

View File

@ -23,6 +23,51 @@ class TestModels(unittest.TestCase):
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1
def test_dart(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
param = {'max_depth': 5, 'objective': 'binary:logistic', 'booster': 'dart', 'silent': False}
# specify validations set to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
# this is prediction
preds = bst.predict(dtest, ntree_limit=num_round)
labels = dtest.get_label()
err = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
# error must be smaller than 10%
assert err < 0.1
# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
# save model
bst.save_model('xgb.model.dart')
# load model and data in
bst2 = xgb.Booster(params=param, model_file='xgb.model.dart')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2, ntree_limit=num_round)
# assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0
# check whether sample_type and normalize_type work
num_round = 50
param['silent'] = True
param['learning_rate'] = 0.1
param['rate_drop'] = 0.1
preds_list = []
for p in [[p0, p1] for p0 in ['uniform', 'weighted'] for p1 in ['tree', 'forest']]:
param['sample_type'] = p[0]
param['normalize_type'] = p[1]
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest, ntree_limit=num_round)
err = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1
preds_list.append(preds)
for ii in range(len(preds_list)):
for jj in range(ii + 1, len(preds_list)):
assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0
def test_eta_decay(self):
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 4

View File

@ -1,5 +1,5 @@
import xgboost as xgb
import xgboost.testing as tm
import testing as tm
import numpy as np
import unittest

View File

@ -1,5 +1,5 @@
import xgboost as xgb
import xgboost.testing as tm
import testing as tm
import numpy as np
import unittest

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import numpy as np
import xgboost as xgb
import xgboost.testing as tm
import testing as tm
import unittest
try:

View File

@ -1,5 +1,5 @@
import xgboost as xgb
import xgboost.testing as tm
import testing as tm
import numpy as np
import unittest

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import numpy as np
import xgboost as xgb
import xgboost.testing as tm
import testing as tm
import unittest
try:

View File

@ -1,7 +1,7 @@
import numpy as np
import random
import xgboost as xgb
import xgboost.testing as tm
import testing as tm
rng = np.random.RandomState(1994)

View File

@ -17,6 +17,6 @@ def _skip_if_no_pandas():
def _skip_if_no_matplotlib():
try:
import matplotlib.pyplot as plt # noqa
import matplotlib.pyplot as _ # noqa
except ImportError:
raise nose.SkipTest()