[doc] Include dask examples into doc. (#7530)
This commit is contained in:
parent 54582f641a
commit ec56d5869b
@@ -1,6 +0,0 @@
-Dask
-====
-
-This directory contains some demonstrations for using `dask` with `XGBoost`.
-For an overview, see
-https://xgboost.readthedocs.io/en/latest/tutorials/dask.html .
 demo/dask/README.rst (new file) | 5
@@ -0,0 +1,5 @@
+XGBoost Dask Feature Walkthrough
+================================
+
+This directory contains some demonstrations for using `dask` with `XGBoost`. For an
+overview, see :doc:`/tutorials/dask`
@@ -1,3 +1,9 @@
+"""
+Example of training survival model with Dask on CPU
+===================================================
+
+"""
+
 import xgboost as xgb
 import os
 from xgboost.dask import DaskDMatrix
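The hunk above only adds a gallery title to the survival demo; for orientation, AFT survival training with dask looks roughly like the sketch below. The censored-label construction and all sizes are illustrative assumptions, not content from the commit.

.. code-block:: python

    import xgboost as xgb
    from xgboost.dask import DaskDMatrix
    from dask.distributed import Client, LocalCluster
    from dask import array as da

    if __name__ == "__main__":
        with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
            X = da.random.random((1_000, 10), chunks=(100, 10))
            # AFT labels are lower/upper bounds; equal bounds mean uncensored
            lower = da.random.random(1_000, chunks=100)
            upper = lower + 1.0
            dtrain = DaskDMatrix(
                client, X, label_lower_bound=lower, label_upper_bound=upper
            )
            params = {
                "objective": "survival:aft",
                "aft_loss_distribution": "normal",
                "tree_method": "hist",
            }
            output = xgb.dask.train(client, params, dtrain, num_boost_round=4)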
@@ -1,3 +1,8 @@
+"""
+Example of training with Dask on CPU
+====================================
+
+"""
 import xgboost as xgb
 from xgboost.dask import DaskDMatrix
 from dask.distributed import Client
@@ -1,4 +1,7 @@
-"""Example of using callbacks in Dask"""
+"""
+Example of using callbacks with Dask
+====================================
+"""
 import numpy as np
 import xgboost as xgb
 from xgboost.dask import DaskDMatrix
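The callbacks demo passes callbacks into the dask ``train`` call exactly as in the single-node interface; a minimal sketch of that pattern (the early-stopping choice and the sizes are illustrative, not the demo's exact content):

.. code-block:: python

    import xgboost as xgb
    from xgboost.dask import DaskDMatrix
    from dask.distributed import Client, LocalCluster
    from dask import array as da

    if __name__ == "__main__":
        with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
            X = da.random.random((10_000, 10), chunks=(1_000, 10))
            y = da.random.random(10_000, chunks=1_000)
            dtrain = DaskDMatrix(client, X, y)
            output = xgb.dask.train(
                client,
                {"objective": "reg:squarederror"},
                dtrain,
                num_boost_round=10,
                evals=[(dtrain, "train")],
                # callbacks are passed exactly as with xgboost.train
                callbacks=[xgb.callback.EarlyStopping(rounds=3)],
            )
            print(output["history"])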
@@ -1,3 +1,7 @@
+"""
+Example of training with Dask on GPU
+====================================
+"""
 from dask_cuda import LocalCUDACluster
 from dask.distributed import Client
 from dask import array as da
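The GPU demo combines these imports in the usual way: one worker per GPU via ``LocalCUDACluster``, and ``gpu_hist`` as the tree method. A minimal sketch with illustrative sizes, not the demo's exact body:

.. code-block:: python

    import xgboost as xgb
    from xgboost.dask import DaskDMatrix
    from dask_cuda import LocalCUDACluster  # one worker per GPU
    from dask.distributed import Client
    from dask import array as da

    if __name__ == "__main__":
        with LocalCUDACluster() as cluster, Client(cluster) as client:
            X = da.random.random((100_000, 20), chunks=(10_000, 20))
            y = da.random.random(100_000, chunks=10_000)
            dtrain = DaskDMatrix(client, X, y)
            # "gpu_hist" selects the GPU implementation of the hist method
            output = xgb.dask.train(
                client, {"tree_method": "gpu_hist"}, dtrain, num_boost_round=4
            )
            booster = output["booster"]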
@@ -1,6 +1,7 @@
-'''Dask interface demo:
-
-Use scikit-learn regressor interface with CPU histogram tree method.'''
+"""
+Use scikit-learn regressor interface with CPU histogram tree method
+===================================================================
+"""
 from dask.distributed import Client
 from dask.distributed import LocalCluster
 from dask import array as da
@@ -16,7 +17,7 @@ def main(client):
     y = da.random.random(m, partition_size)

     regressor = xgboost.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
-    regressor.set_params(tree_method='hist')
+    regressor.set_params(tree_method="hist")
     # assigning client here is optional
     regressor.client = client

@@ -26,13 +27,13 @@ def main(client):
     bst = regressor.get_booster()
     history = regressor.evals_result()

-    print('Evaluation history:', history)
+    print("Evaluation history:", history)
     # returned prediction is always a dask array.
     assert isinstance(prediction, da.Array)
     return bst  # returning the trained model


-if __name__ == '__main__':
+if __name__ == "__main__":
     # or use other clusters for scaling
     with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
         with Client(cluster) as client:
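Assembled from the fragments in the two hunks above, the demo's full flow is approximately as follows; the ``fit``/``predict`` lines and the array shapes are hedged reconstructions of the lines the diff does not show:

.. code-block:: python

    import xgboost
    import xgboost.dask  # make the xgboost.dask submodule available
    from dask.distributed import Client, LocalCluster
    from dask import array as da


    def main(client):
        m, partition_size = 100_000, 10_000  # placeholder sizes
        X = da.random.random((m, 20), chunks=(partition_size, 20))
        y = da.random.random(m, partition_size)

        regressor = xgboost.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
        regressor.set_params(tree_method="hist")
        # assigning client here is optional
        regressor.client = client

        regressor.fit(X, y, eval_set=[(X, y)])
        prediction = regressor.predict(X)

        bst = regressor.get_booster()
        history = regressor.evals_result()

        print("Evaluation history:", history)
        # returned prediction is always a dask array.
        assert isinstance(prediction, da.Array)
        return bst  # returning the trained model


    if __name__ == "__main__":
        # or use other clusters for scaling
        with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
            with Client(cluster) as client:
                main(client)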
@@ -1,6 +1,7 @@
-'''Dask interface demo:
-
-Use scikit-learn regressor interface with GPU histogram tree method.'''
+"""
+Use scikit-learn regressor interface with GPU histogram tree method
+===================================================================
+"""

 from dask.distributed import Client
 # It's recommended to use dask_cuda for GPU assignment
 doc/c.rst | 10
@@ -2,11 +2,11 @@
 XGBoost C Package
 #################

-XGBoost implements a set of C API designed for various bindings, we maintain its
-stability and the CMake/make build interface. See ``demo/c-api/README.md`` for an
-overview and related examples. Also one can generate doxygen document by providing
-``-DBUILD_C_DOC=ON`` as parameter to ``CMake`` during build, or simply look at function
-comments in ``include/xgboost/c_api.h``.
+XGBoost implements a set of C API designed for various bindings, we maintain its stability
+and the CMake/make build interface. See :doc:`/tutorials/c_api_tutorial` for an
+introduction and ``demo/c-api/`` for related examples. Also one can generate doxygen
+document by providing ``-DBUILD_C_DOC=ON`` as parameter to ``CMake`` during build, or
+simply look at function comments in ``include/xgboost/c_api.h``.

 * `C API documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/c__api_8h.html>`_
 * `C API documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/c__api_8h.html>`_
 doc/conf.py | 12
@@ -90,8 +90,10 @@ extensions = [
 ]

 sphinx_gallery_conf = {
-    "examples_dirs": "../demo/guide-python",  # path to your example scripts
-    "gallery_dirs": "python/examples",  # path to where to save gallery generated output
+    # path to your example scripts
+    "examples_dirs": ["../demo/guide-python", "../demo/dask"],
+    # path to where to save gallery generated output
+    "gallery_dirs": ["python/examples", "python/dask-examples"],
     "matplotlib_animations": True,
 }
@@ -203,10 +205,10 @@ latex_documents = [

 intersphinx_mapping = {
     "python": ("https://docs.python.org/3.6", None),
-    "numpy": ("http://docs.scipy.org/doc/numpy/", None),
-    "scipy": ("http://docs.scipy.org/doc/scipy/reference/", None),
+    "numpy": ("https://docs.scipy.org/doc/numpy/", None),
+    "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None),
     "pandas": ("http://pandas-docs.github.io/pandas-docs-travis/", None),
-    "sklearn": ("http://scikit-learn.org/stable", None),
+    "sklearn": ("https://scikit-learn.org/stable", None),
     "dask": ("https://docs.dask.org/en/stable/", None),
     "distributed": ("https://distributed.dask.org/en/stable/", None),
 }
 doc/python/.gitignore (vendored) | 1
@@ -1 +1,2 @@
 examples
+dask-examples
@@ -14,3 +14,4 @@ Contents
   callbacks
   model
   examples/index
+  dask-examples/index
@@ -63,15 +63,17 @@ on a dask cluster:
         evals=[(dtrain, "train")],
     )

-Here we first create a cluster in single-node mode with ``dask.distributed.LocalCluster``, then
-connect a ``dask.distributed.Client`` to this cluster, setting up an environment for later
-computation. Notice that the cluster construction is guared by ``__name__ == "__main__"``, which is
-necessary otherwise there might be obscure errors.
+Here we first create a cluster in single-node mode with
+:py:class:`distributed.LocalCluster`, then connect a :py:class:`distributed.Client` to
+this cluster, setting up an environment for later computation. Notice that the cluster
+construction is guarded by ``__name__ == "__main__"``, which is necessary, as otherwise
+there might be obscure errors.

-We then create a ``DaskDMatrix`` object and pass it to ``train``, along with some other parameters,
-much like XGBoost's normal, non-dask interface. Unlike that interface, ``data`` and ``label`` must
-be either `Dask DataFrame <https://examples.dask.org/dataframe.html>`_ or
-`Dask Array <https://examples.dask.org/array.html>`_ instances.
+We then create a :py:class:`xgboost.dask.DaskDMatrix` object and pass it to
+:py:func:`xgboost.dask.train`, along with some other parameters, much like XGBoost's
+normal, non-dask interface. Unlike that interface, ``data`` and ``label`` must be either
+:py:class:`Dask DataFrame <dask.dataframe.DataFrame>` or :py:class:`Dask Array
+<dask.array.Array>` instances.

 The primary difference with XGBoost's dask interface is
 we pass our dask client as an additional argument for carrying out the computation. Note that if
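For orientation, the complete flow these two paragraphs describe is approximately the sketch below; sizes and parameters are illustrative, not the tutorial's elided code:

.. code-block:: python

    import xgboost as xgb
    from xgboost.dask import DaskDMatrix
    from dask.distributed import Client, LocalCluster
    from dask import array as da

    if __name__ == "__main__":  # guard cluster construction, as noted above
        with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
            with Client(cluster) as client:
                X = da.random.random((100_000, 20), chunks=(10_000, 20))
                y = da.random.random(100_000, chunks=10_000)
                dtrain = DaskDMatrix(client, X, y)
                output = xgb.dask.train(
                    client,
                    {"verbosity": 1, "tree_method": "hist"},
                    dtrain,
                    num_boost_round=4,
                    evals=[(dtrain, "train")],
                )
                # output == {"booster": Booster, "history": dict}
                booster = output["booster"]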
@@ -86,7 +88,7 @@ returns a model and the computation history as a Python dictionary:
     {'booster': Booster,
      'history': dict}

-For prediction, pass the ``output`` returned by ``train`` into ``xgb.dask.predict``:
+For prediction, pass the ``output`` returned by ``train`` into :py:func:`xgboost.dask.predict`:

 .. code-block:: python

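The tutorial's code block that follows falls outside this hunk; a hedged sketch of the call the new cross-reference points to, continuing the training sketch above:

.. code-block:: python

    # `output` is the dict returned by xgboost.dask.train
    prediction = xgb.dask.predict(client, output, dtrain)
    # or pass the booster directly, with a dask collection as data
    prediction = xgb.dask.predict(client, output["booster"], X)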
@@ -105,14 +107,15 @@ computation a bit faster when meta information like ``base_margin`` is not neede

 Here ``prediction`` is a dask ``Array`` object containing predictions from model if input
 is a ``DaskDMatrix`` or ``da.Array``. When putting dask collection directly into the
-``predict`` function or using ``inplace_predict``, the output type depends on input data.
-See next section for details.
+``predict`` function or using :py:func:`xgboost.dask.inplace_predict`, the output type
+depends on input data. See next section for details.

 Alternatively, XGBoost also implements the Scikit-Learn interface with
-``DaskXGBClassifier``, ``DaskXGBRegressor``, ``DaskXGBRanker`` and 2 random forest
-variances. This wrapper is similar to the single node Scikit-Learn interface in xgboost,
-with dask collection as inputs and has an additional ``client`` attribute. See following
-sections and ``xgboost/demo/dask`` for more examples.
+:py:class:`~xgboost.dask.DaskXGBClassifier`, :py:class:`~xgboost.dask.DaskXGBRegressor`,
+:py:class:`~xgboost.dask.DaskXGBRanker` and 2 random forest variants. This wrapper is
+similar to the single node Scikit-Learn interface in xgboost, with dask collection as
+inputs and has an additional ``client`` attribute. See following sections and
+:ref:`sphx_glr_python_dask-examples` for more examples.


 ******************
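Continuing the same sketch, a small hedged illustration of direct-collection prediction and the output-type rule the revised paragraph states (dataframe in, series out for single-output models is an assumption drawn from the tutorial's "depends on input data"):

.. code-block:: python

    import dask.dataframe as dd

    # dask array in -> dask array out
    pred_arr = xgb.dask.inplace_predict(client, booster, X)
    # dask dataframe in -> dask series out
    pred_ser = xgb.dask.inplace_predict(client, booster, dd.from_dask_array(X))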
@@ -152,7 +155,7 @@ depending on output shape. For example, when shap based prediction is used, the
 value can have 3 or 4 dimensions, in such cases an ``Array`` is always returned.

 The performance of running prediction, either using ``predict`` or ``inplace_predict``, is
-sensitive to number of blocks. Internally, it's implemented using ``da.map_blocks`` or
+sensitive to number of blocks. Internally, it's implemented using ``da.map_blocks`` and
 ``dd.map_partitions``. When number of partitions is large and each of them have only
 small amount of data, the overhead of calling predict becomes visible. On the other hand,
 if not using GPU, the number of threads used for prediction on each block matters. Right
@@ -160,7 +163,7 @@ now, xgboost uses single thread for each partition. If the number of blocks on
 workers is smaller than number of cores, then the CPU workers might not be fully utilized.

 One simple optimization for running consecutive predictions is using
-``distributed.Future``:
+:py:class:`distributed.Future`:

 .. code-block:: python

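The code block here is elided by the hunk; the pattern it names, reusing a scattered booster as a :py:class:`distributed.Future` across consecutive predictions, looks roughly like this (``datasets`` is an assumed list of dask collections):

.. code-block:: python

    # scatter the trained booster to all workers once; the returned Future
    # can be reused so the model is not re-transferred per predict call
    booster_future = client.scatter(booster, broadcast=True)
    predictions = [
        xgb.dask.predict(client, booster_future, X_part) for X_part in datasets
    ]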
@@ -504,8 +507,9 @@ Here are some practices on reducing memory usage with dask and xgboost.
   nice summary.

 - When using GPU input, like dataframe loaded by ``dask_cudf``, you can try
-  ``xgboost.dask.DaskDeviceQuantileDMatrix`` as a drop in replacement for ``DaskDMatrix``
-  to reduce overall memory usage. See ``demo/dask/gpu_training.py`` for an example.
+  :py:class:`xgboost.dask.DaskDeviceQuantileDMatrix` as a drop in replacement for ``DaskDMatrix``
+  to reduce overall memory usage. See
+  :ref:`sphx_glr_python_dask-examples_gpu_training.py` for an example.

 - Use in-place prediction when possible.
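A hedged sketch of the drop-in replacement this bullet recommends, assuming ``X`` and ``y`` are dask_cudf collections already on GPU and a ``client`` is connected:

.. code-block:: python

    import xgboost as xgb
    from xgboost.dask import DaskDeviceQuantileDMatrix

    # built from quantized GPU data, avoiding a full DMatrix copy
    dtrain = DaskDeviceQuantileDMatrix(client, X, y)
    output = xgb.dask.train(
        client, {"tree_method": "gpu_hist"}, dtrain, num_boost_round=4
    )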