[doc] Include dask examples into doc. (#7530)

This commit is contained in:
Jiaming Yuan 2022-01-05 03:27:22 +08:00 committed by GitHub
parent 54582f641a
commit ec56d5869b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 74 additions and 47 deletions

View File

@ -1,6 +0,0 @@
Dask
====
This directory contains some demonstrations for using `dask` with `XGBoost`.
For an overview, see
https://xgboost.readthedocs.io/en/latest/tutorials/dask.html .

5
demo/dask/README.rst Normal file
View File

@ -0,0 +1,5 @@
XGBoost Dask Feature Walkthrough
================================
This directory contains some demonstrations for using `dask` with `XGBoost`. For an
overview, see :doc:`/tutorials/dask`

View File

@ -1,3 +1,9 @@
"""
Example of training survival model with Dask on CPU
===================================================
"""
import xgboost as xgb import xgboost as xgb
import os import os
from xgboost.dask import DaskDMatrix from xgboost.dask import DaskDMatrix

View File

@ -1,3 +1,8 @@
"""
Example of training with Dask on CPU
====================================
"""
import xgboost as xgb import xgboost as xgb
from xgboost.dask import DaskDMatrix from xgboost.dask import DaskDMatrix
from dask.distributed import Client from dask.distributed import Client

View File

@ -1,4 +1,7 @@
"""Example of using callbacks in Dask""" """
Example of using callbacks with Dask
====================================
"""
import numpy as np import numpy as np
import xgboost as xgb import xgboost as xgb
from xgboost.dask import DaskDMatrix from xgboost.dask import DaskDMatrix

View File

@ -1,3 +1,7 @@
"""
Example of training with Dask on GPU
====================================
"""
from dask_cuda import LocalCUDACluster from dask_cuda import LocalCUDACluster
from dask.distributed import Client from dask.distributed import Client
from dask import array as da from dask import array as da

View File

@ -1,6 +1,7 @@
'''Dask interface demo: """
Use scikit-learn regressor interface with CPU histogram tree method
Use scikit-learn regressor interface with CPU histogram tree method.''' ===================================================================
"""
from dask.distributed import Client from dask.distributed import Client
from dask.distributed import LocalCluster from dask.distributed import LocalCluster
from dask import array as da from dask import array as da
@ -16,7 +17,7 @@ def main(client):
y = da.random.random(m, partition_size) y = da.random.random(m, partition_size)
regressor = xgboost.dask.DaskXGBRegressor(verbosity=1, n_estimators=2) regressor = xgboost.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
regressor.set_params(tree_method='hist') regressor.set_params(tree_method="hist")
# assigning client here is optional # assigning client here is optional
regressor.client = client regressor.client = client
@ -26,13 +27,13 @@ def main(client):
bst = regressor.get_booster() bst = regressor.get_booster()
history = regressor.evals_result() history = regressor.evals_result()
print('Evaluation history:', history) print("Evaluation history:", history)
# returned prediction is always a dask array. # returned prediction is always a dask array.
assert isinstance(prediction, da.Array) assert isinstance(prediction, da.Array)
return bst # returning the trained model return bst # returning the trained model
if __name__ == '__main__': if __name__ == "__main__":
# or use other clusters for scaling # or use other clusters for scaling
with LocalCluster(n_workers=4, threads_per_worker=1) as cluster: with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
with Client(cluster) as client: with Client(cluster) as client:

View File

@ -1,6 +1,7 @@
'''Dask interface demo: """
Use scikit-learn regressor interface with GPU histogram tree method
Use scikit-learn regressor interface with GPU histogram tree method.''' ===================================================================
"""
from dask.distributed import Client from dask.distributed import Client
# It's recommended to use dask_cuda for GPU assignment # It's recommended to use dask_cuda for GPU assignment

View File

@ -2,11 +2,11 @@
XGBoost C Package XGBoost C Package
################# #################
XGBoost implements a set of C API designed for various bindings, we maintain its XGBoost implements a set of C API designed for various bindings, we maintain its stability
stability and the CMake/make build interface. See ``demo/c-api/README.md`` for an and the CMake/make build interface. See :doc:`/tutorials/c_api_tutorial` for an
overview and related examples. Also one can generate doxygen document by providing introduction and ``demo/c-api/`` for related examples. Also one can generate doxygen
``-DBUILD_C_DOC=ON`` as parameter to ``CMake`` during build, or simply look at function document by providing ``-DBUILD_C_DOC=ON`` as parameter to ``CMake`` during build, or
comments in ``include/xgboost/c_api.h``. simply look at function comments in ``include/xgboost/c_api.h``.
* `C API documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/c__api_8h.html>`_ * `C API documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/c__api_8h.html>`_
* `C API documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/c__api_8h.html>`_ * `C API documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/c__api_8h.html>`_

View File

@ -90,8 +90,10 @@ extensions = [
] ]
sphinx_gallery_conf = { sphinx_gallery_conf = {
"examples_dirs": "../demo/guide-python", # path to your example scripts # path to your example scripts
"gallery_dirs": "python/examples", # path to where to save gallery generated output "examples_dirs": ["../demo/guide-python", "../demo/dask"],
# path to where to save gallery generated output
"gallery_dirs": ["python/examples", "python/dask-examples"],
"matplotlib_animations": True, "matplotlib_animations": True,
} }
@ -203,10 +205,10 @@ latex_documents = [
intersphinx_mapping = { intersphinx_mapping = {
"python": ("https://docs.python.org/3.6", None), "python": ("https://docs.python.org/3.6", None),
"numpy": ("http://docs.scipy.org/doc/numpy/", None), "numpy": ("https://docs.scipy.org/doc/numpy/", None),
"scipy": ("http://docs.scipy.org/doc/scipy/reference/", None), "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None),
"pandas": ("http://pandas-docs.github.io/pandas-docs-travis/", None), "pandas": ("http://pandas-docs.github.io/pandas-docs-travis/", None),
"sklearn": ("http://scikit-learn.org/stable", None), "sklearn": ("https://scikit-learn.org/stable", None),
"dask": ("https://docs.dask.org/en/stable/", None), "dask": ("https://docs.dask.org/en/stable/", None),
"distributed": ("https://distributed.dask.org/en/stable/", None), "distributed": ("https://distributed.dask.org/en/stable/", None),
} }

View File

@ -1 +1,2 @@
examples examples
dask-examples

View File

@ -14,3 +14,4 @@ Contents
callbacks callbacks
model model
examples/index examples/index
dask-examples/index

View File

@ -63,15 +63,17 @@ on a dask cluster:
evals=[(dtrain, "train")], evals=[(dtrain, "train")],
) )
Here we first create a cluster in single-node mode with ``dask.distributed.LocalCluster``, then Here we first create a cluster in single-node mode with
connect a ``dask.distributed.Client`` to this cluster, setting up an environment for later :py:class:`distributed.LocalCluster`, then connect a :py:class:`distributed.Client` to
computation. Notice that the cluster construction is guarded by ``__name__ == "__main__"``, which is this cluster, setting up an environment for later computation. Notice that the cluster
necessary otherwise there might be obscure errors. construction is guarded by ``__name__ == "__main__"``, which is necessary otherwise there
might be obscure errors.
We then create a ``DaskDMatrix`` object and pass it to ``train``, along with some other parameters, We then create a :py:class:`xgboost.dask.DaskDMatrix` object and pass it to
much like XGBoost's normal, non-dask interface. Unlike that interface, ``data`` and ``label`` must :py:func:`xgboost.dask.train`, along with some other parameters, much like XGBoost's
be either `Dask DataFrame <https://examples.dask.org/dataframe.html>`_ or normal, non-dask interface. Unlike that interface, ``data`` and ``label`` must be either
`Dask Array <https://examples.dask.org/array.html>`_ instances. :py:class:`Dask DataFrame <dask.dataframe.DataFrame>` or :py:class:`Dask Array
<dask.array.Array>` instances.
The primary difference with XGBoost's dask interface is The primary difference with XGBoost's dask interface is
we pass our dask client as an additional argument for carrying out the computation. Note that if we pass our dask client as an additional argument for carrying out the computation. Note that if
@ -86,7 +88,7 @@ returns a model and the computation history as a Python dictionary:
{'booster': Booster, {'booster': Booster,
'history': dict} 'history': dict}
For prediction, pass the ``output`` returned by ``train`` into ``xgb.dask.predict``: For prediction, pass the ``output`` returned by ``train`` into :py:func:`xgboost.dask.predict`:
.. code-block:: python .. code-block:: python
@ -105,14 +107,15 @@ computation a bit faster when meta information like ``base_margin`` is not neede
Here ``prediction`` is a dask ``Array`` object containing predictions from model if input Here ``prediction`` is a dask ``Array`` object containing predictions from model if input
is a ``DaskDMatrix`` or ``da.Array``. When putting dask collection directly into the is a ``DaskDMatrix`` or ``da.Array``. When putting dask collection directly into the
``predict`` function or using ``inplace_predict``, the output type depends on input data. ``predict`` function or using :py:func:`xgboost.dask.inplace_predict`, the output type
See next section for details. depends on input data. See next section for details.
Alternatively, XGBoost also implements the Scikit-Learn interface with Alternatively, XGBoost also implements the Scikit-Learn interface with
``DaskXGBClassifier``, ``DaskXGBRegressor``, ``DaskXGBRanker`` and 2 random forest :py:class:`~xgboost.dask.DaskXGBClassifier`, :py:class:`~xgboost.dask.DaskXGBRegressor`,
variants. This wrapper is similar to the single node Scikit-Learn interface in xgboost, :py:class:`~xgboost.dask.DaskXGBRanker` and 2 random forest variants. This wrapper is
with dask collection as inputs and has an additional ``client`` attribute. See following similar to the single node Scikit-Learn interface in xgboost, with dask collection as
sections and ``xgboost/demo/dask`` for more examples. inputs and has an additional ``client`` attribute. See following sections and
:ref:`sphx_glr_python_dask-examples` for more examples.
****************** ******************
@ -152,7 +155,7 @@ depending on output shape. For example, when shap based prediction is used, the
value can have 3 or 4 dimensions, in such cases an ``Array`` is always returned. value can have 3 or 4 dimensions, in such cases an ``Array`` is always returned.
The performance of running prediction, either using ``predict`` or ``inplace_predict``, is The performance of running prediction, either using ``predict`` or ``inplace_predict``, is
sensitive to number of blocks. Internally, it's implemented using ``da.map_blocks`` or sensitive to number of blocks. Internally, it's implemented using ``da.map_blocks`` and
``dd.map_partitions``. When number of partitions is large and each of them has only ``dd.map_partitions``. When number of partitions is large and each of them has only
small amount of data, the overhead of calling predict becomes visible. On the other hand, small amount of data, the overhead of calling predict becomes visible. On the other hand,
if not using GPU, the number of threads used for prediction on each block matters. Right if not using GPU, the number of threads used for prediction on each block matters. Right
@ -160,7 +163,7 @@ now, xgboost uses single thread for each partition. If the number of blocks on
workers is smaller than number of cores, then the CPU workers might not be fully utilized. workers is smaller than number of cores, then the CPU workers might not be fully utilized.
One simple optimization for running consecutive predictions is using One simple optimization for running consecutive predictions is using
``distributed.Future``: :py:class:`distributed.Future`:
.. code-block:: python .. code-block:: python
@ -504,8 +507,9 @@ Here are some practices on reducing memory usage with dask and xgboost.
nice summary. nice summary.
- When using GPU input, like dataframe loaded by ``dask_cudf``, you can try - When using GPU input, like dataframe loaded by ``dask_cudf``, you can try
``xgboost.dask.DaskDeviceQuantileDMatrix`` as a drop in replacement for ``DaskDMatrix`` :py:class:`xgboost.dask.DaskDeviceQuantileDMatrix` as a drop in replacement for ``DaskDMatrix``
to reduce overall memory usage. See ``demo/dask/gpu_training.py`` for an example. to reduce overall memory usage. See
:ref:`sphx_glr_python_dask-examples_gpu_training.py` for an example.
- Use in-place prediction when possible. - Use in-place prediction when possible.