[doc] Include dask examples into doc. (#7530)
This commit is contained in:
parent
54582f641a
commit
ec56d5869b
@ -1,6 +0,0 @@
|
|||||||
Dask
|
|
||||||
====
|
|
||||||
|
|
||||||
This directory contains some demonstrations for using `dask` with `XGBoost`.
|
|
||||||
For an overview, see
|
|
||||||
https://xgboost.readthedocs.io/en/latest/tutorials/dask.html .
|
|
||||||
5
demo/dask/README.rst
Normal file
5
demo/dask/README.rst
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
XGBoost Dask Feature Walkthrough
|
||||||
|
================================
|
||||||
|
|
||||||
|
This directory contains some demonstrations for using `dask` with `XGBoost`. For an
|
||||||
|
overview, see :doc:`/tutorials/dask`
|
||||||
@ -1,3 +1,9 @@
|
|||||||
|
"""
|
||||||
|
Example of training survival model with Dask on CPU
|
||||||
|
===================================================
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
import os
|
import os
|
||||||
from xgboost.dask import DaskDMatrix
|
from xgboost.dask import DaskDMatrix
|
||||||
|
|||||||
@ -1,3 +1,8 @@
|
|||||||
|
"""
|
||||||
|
Example of training with Dask on CPU
|
||||||
|
====================================
|
||||||
|
|
||||||
|
"""
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
from xgboost.dask import DaskDMatrix
|
from xgboost.dask import DaskDMatrix
|
||||||
from dask.distributed import Client
|
from dask.distributed import Client
|
||||||
|
|||||||
@ -1,4 +1,7 @@
|
|||||||
"""Example of using callbacks in Dask"""
|
"""
|
||||||
|
Example of using callbacks with Dask
|
||||||
|
====================================
|
||||||
|
"""
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
from xgboost.dask import DaskDMatrix
|
from xgboost.dask import DaskDMatrix
|
||||||
@ -1,3 +1,7 @@
|
|||||||
|
"""
|
||||||
|
Example of training with Dask on GPU
|
||||||
|
====================================
|
||||||
|
"""
|
||||||
from dask_cuda import LocalCUDACluster
|
from dask_cuda import LocalCUDACluster
|
||||||
from dask.distributed import Client
|
from dask.distributed import Client
|
||||||
from dask import array as da
|
from dask import array as da
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
'''Dask interface demo:
|
"""
|
||||||
|
Use scikit-learn regressor interface with CPU histogram tree method
|
||||||
Use scikit-learn regressor interface with CPU histogram tree method.'''
|
===================================================================
|
||||||
|
"""
|
||||||
from dask.distributed import Client
|
from dask.distributed import Client
|
||||||
from dask.distributed import LocalCluster
|
from dask.distributed import LocalCluster
|
||||||
from dask import array as da
|
from dask import array as da
|
||||||
@ -16,7 +17,7 @@ def main(client):
|
|||||||
y = da.random.random(m, partition_size)
|
y = da.random.random(m, partition_size)
|
||||||
|
|
||||||
regressor = xgboost.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
|
regressor = xgboost.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
|
||||||
regressor.set_params(tree_method='hist')
|
regressor.set_params(tree_method="hist")
|
||||||
# assigning client here is optional
|
# assigning client here is optional
|
||||||
regressor.client = client
|
regressor.client = client
|
||||||
|
|
||||||
@ -26,13 +27,13 @@ def main(client):
|
|||||||
bst = regressor.get_booster()
|
bst = regressor.get_booster()
|
||||||
history = regressor.evals_result()
|
history = regressor.evals_result()
|
||||||
|
|
||||||
print('Evaluation history:', history)
|
print("Evaluation history:", history)
|
||||||
# returned prediction is always a dask array.
|
# returned prediction is always a dask array.
|
||||||
assert isinstance(prediction, da.Array)
|
assert isinstance(prediction, da.Array)
|
||||||
return bst # returning the trained model
|
return bst # returning the trained model
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
# or use other clusters for scaling
|
# or use other clusters for scaling
|
||||||
with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
|
with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
|
||||||
with Client(cluster) as client:
|
with Client(cluster) as client:
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
'''Dask interface demo:
|
"""
|
||||||
|
Use scikit-learn regressor interface with GPU histogram tree method
|
||||||
Use scikit-learn regressor interface with GPU histogram tree method.'''
|
===================================================================
|
||||||
|
"""
|
||||||
|
|
||||||
from dask.distributed import Client
|
from dask.distributed import Client
|
||||||
# It's recommended to use dask_cuda for GPU assignment
|
# It's recommended to use dask_cuda for GPU assignment
|
||||||
|
|||||||
10
doc/c.rst
10
doc/c.rst
@ -2,11 +2,11 @@
|
|||||||
XGBoost C Package
|
XGBoost C Package
|
||||||
#################
|
#################
|
||||||
|
|
||||||
XGBoost implements a set of C API designed for various bindings, we maintain its
|
XGBoost implements a set of C API designed for various bindings, we maintain its stability
|
||||||
stability and the CMake/make build interface. See ``demo/c-api/README.md`` for an
|
and the CMake/make build interface. See :doc:`/tutorials/c_api_tutorial` for an
|
||||||
overview and related examples. Also one can generate doxygen document by providing
|
introduction and ``demo/c-api/`` for related examples. Also one can generate doxygen
|
||||||
``-DBUILD_C_DOC=ON`` as parameter to ``CMake`` during build, or simply look at function
|
document by providing ``-DBUILD_C_DOC=ON`` as parameter to ``CMake`` during build, or
|
||||||
comments in ``include/xgboost/c_api.h``.
|
simply look at function comments in ``include/xgboost/c_api.h``.
|
||||||
|
|
||||||
* `C API documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/c__api_8h.html>`_
|
* `C API documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/c__api_8h.html>`_
|
||||||
* `C API documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/c__api_8h.html>`_
|
* `C API documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/c__api_8h.html>`_
|
||||||
|
|||||||
12
doc/conf.py
12
doc/conf.py
@ -90,8 +90,10 @@ extensions = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
sphinx_gallery_conf = {
|
sphinx_gallery_conf = {
|
||||||
"examples_dirs": "../demo/guide-python", # path to your example scripts
|
# path to your example scripts
|
||||||
"gallery_dirs": "python/examples", # path to where to save gallery generated output
|
"examples_dirs": ["../demo/guide-python", "../demo/dask"],
|
||||||
|
# path to where to save gallery generated output
|
||||||
|
"gallery_dirs": ["python/examples", "python/dask-examples"],
|
||||||
"matplotlib_animations": True,
|
"matplotlib_animations": True,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -203,10 +205,10 @@ latex_documents = [
|
|||||||
|
|
||||||
intersphinx_mapping = {
|
intersphinx_mapping = {
|
||||||
"python": ("https://docs.python.org/3.6", None),
|
"python": ("https://docs.python.org/3.6", None),
|
||||||
"numpy": ("http://docs.scipy.org/doc/numpy/", None),
|
"numpy": ("https://docs.scipy.org/doc/numpy/", None),
|
||||||
"scipy": ("http://docs.scipy.org/doc/scipy/reference/", None),
|
"scipy": ("https://docs.scipy.org/doc/scipy/reference/", None),
|
||||||
"pandas": ("http://pandas-docs.github.io/pandas-docs-travis/", None),
|
"pandas": ("http://pandas-docs.github.io/pandas-docs-travis/", None),
|
||||||
"sklearn": ("http://scikit-learn.org/stable", None),
|
"sklearn": ("https://scikit-learn.org/stable", None),
|
||||||
"dask": ("https://docs.dask.org/en/stable/", None),
|
"dask": ("https://docs.dask.org/en/stable/", None),
|
||||||
"distributed": ("https://distributed.dask.org/en/stable/", None),
|
"distributed": ("https://distributed.dask.org/en/stable/", None),
|
||||||
}
|
}
|
||||||
|
|||||||
1
doc/python/.gitignore
vendored
1
doc/python/.gitignore
vendored
@ -1 +1,2 @@
|
|||||||
examples
|
examples
|
||||||
|
dask-examples
|
||||||
@ -14,3 +14,4 @@ Contents
|
|||||||
callbacks
|
callbacks
|
||||||
model
|
model
|
||||||
examples/index
|
examples/index
|
||||||
|
dask-examples/index
|
||||||
|
|||||||
@ -63,15 +63,17 @@ on a dask cluster:
|
|||||||
evals=[(dtrain, "train")],
|
evals=[(dtrain, "train")],
|
||||||
)
|
)
|
||||||
|
|
||||||
Here we first create a cluster in single-node mode with ``dask.distributed.LocalCluster``, then
|
Here we first create a cluster in single-node mode with
|
||||||
connect a ``dask.distributed.Client`` to this cluster, setting up an environment for later
|
:py:class:`distributed.LocalCluster`, then connect a :py:class:`distributed.Client` to
|
||||||
computation. Notice that the cluster construction is guarded by ``__name__ == "__main__"``, which is
|
this cluster, setting up an environment for later computation. Notice that the cluster
|
||||||
necessary otherwise there might be obscure errors.
|
construction is guarded by ``__name__ == "__main__"``, which is necessary otherwise there
|
||||||
|
might be obscure errors.
|
||||||
|
|
||||||
We then create a ``DaskDMatrix`` object and pass it to ``train``, along with some other parameters,
|
We then create a :py:class:`xgboost.dask.DaskDMatrix` object and pass it to
|
||||||
much like XGBoost's normal, non-dask interface. Unlike that interface, ``data`` and ``label`` must
|
:py:func:`xgboost.dask.train`, along with some other parameters, much like XGBoost's
|
||||||
be either `Dask DataFrame <https://examples.dask.org/dataframe.html>`_ or
|
normal, non-dask interface. Unlike that interface, ``data`` and ``label`` must be either
|
||||||
`Dask Array <https://examples.dask.org/array.html>`_ instances.
|
:py:class:`Dask DataFrame <dask.dataframe.DataFrame>` or :py:class:`Dask Array
|
||||||
|
<dask.array.Array>` instances.
|
||||||
|
|
||||||
The primary difference with XGBoost's dask interface is
|
The primary difference with XGBoost's dask interface is
|
||||||
we pass our dask client as an additional argument for carrying out the computation. Note that if
|
we pass our dask client as an additional argument for carrying out the computation. Note that if
|
||||||
@ -86,7 +88,7 @@ returns a model and the computation history as a Python dictionary:
|
|||||||
{'booster': Booster,
|
{'booster': Booster,
|
||||||
'history': dict}
|
'history': dict}
|
||||||
|
|
||||||
For prediction, pass the ``output`` returned by ``train`` into ``xgb.dask.predict``:
|
For prediction, pass the ``output`` returned by ``train`` into :py:func:`xgboost.dask.predict`:
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
@ -105,14 +107,15 @@ computation a bit faster when meta information like ``base_margin`` is not neede
|
|||||||
|
|
||||||
Here ``prediction`` is a dask ``Array`` object containing predictions from model if input
|
Here ``prediction`` is a dask ``Array`` object containing predictions from model if input
|
||||||
is a ``DaskDMatrix`` or ``da.Array``. When putting dask collection directly into the
|
is a ``DaskDMatrix`` or ``da.Array``. When putting dask collection directly into the
|
||||||
``predict`` function or using ``inplace_predict``, the output type depends on input data.
|
``predict`` function or using :py:func:`xgboost.dask.inplace_predict`, the output type
|
||||||
See next section for details.
|
depends on input data. See next section for details.
|
||||||
|
|
||||||
Alternatively, XGBoost also implements the Scikit-Learn interface with
|
Alternatively, XGBoost also implements the Scikit-Learn interface with
|
||||||
``DaskXGBClassifier``, ``DaskXGBRegressor``, ``DaskXGBRanker`` and 2 random forest
|
:py:class:`~xgboost.dask.DaskXGBClassifier`, :py:class:`~xgboost.dask.DaskXGBRegressor`,
|
||||||
variants. This wrapper is similar to the single node Scikit-Learn interface in xgboost,
|
:py:class:`~xgboost.dask.DaskXGBRanker` and 2 random forest variants. This wrapper is
|
||||||
with dask collection as inputs and has an additional ``client`` attribute. See following
|
similar to the single node Scikit-Learn interface in xgboost, with dask collection as
|
||||||
sections and ``xgboost/demo/dask`` for more examples.
|
inputs and has an additional ``client`` attribute. See following sections and
|
||||||
|
:ref:`sphx_glr_python_dask-examples` for more examples.
|
||||||
|
|
||||||
|
|
||||||
******************
|
******************
|
||||||
@ -152,7 +155,7 @@ depending on output shape. For example, when shap based prediction is used, the
|
|||||||
value can have 3 or 4 dimensions, in such cases an ``Array`` is always returned.
|
value can have 3 or 4 dimensions, in such cases an ``Array`` is always returned.
|
||||||
|
|
||||||
The performance of running prediction, either using ``predict`` or ``inplace_predict``, is
|
The performance of running prediction, either using ``predict`` or ``inplace_predict``, is
|
||||||
sensitive to number of blocks. Internally, it's implemented using ``da.map_blocks`` or
|
sensitive to number of blocks. Internally, it's implemented using ``da.map_blocks`` and
|
||||||
``dd.map_partitions``. When the number of partitions is large and each of them has only
|
``dd.map_partitions``. When the number of partitions is large and each of them has only
|
||||||
small amount of data, the overhead of calling predict becomes visible. On the other hand,
|
small amount of data, the overhead of calling predict becomes visible. On the other hand,
|
||||||
if not using GPU, the number of threads used for prediction on each block matters. Right
|
if not using GPU, the number of threads used for prediction on each block matters. Right
|
||||||
@ -160,7 +163,7 @@ now, xgboost uses single thread for each partition. If the number of blocks on
|
|||||||
workers is smaller than number of cores, then the CPU workers might not be fully utilized.
|
workers is smaller than number of cores, then the CPU workers might not be fully utilized.
|
||||||
|
|
||||||
One simple optimization for running consecutive predictions is using
|
One simple optimization for running consecutive predictions is using
|
||||||
``distributed.Future``:
|
:py:class:`distributed.Future`:
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
@ -504,8 +507,9 @@ Here are some practices on reducing memory usage with dask and xgboost.
|
|||||||
nice summary.
|
nice summary.
|
||||||
|
|
||||||
- When using GPU input, like dataframe loaded by ``dask_cudf``, you can try
|
- When using GPU input, like dataframe loaded by ``dask_cudf``, you can try
|
||||||
``xgboost.dask.DaskDeviceQuantileDMatrix`` as a drop in replacement for ``DaskDMatrix``
|
:py:class:`xgboost.dask.DaskDeviceQuantileDMatrix` as a drop in replacement for ``DaskDMatrix``
|
||||||
to reduce overall memory usage. See ``demo/dask/gpu_training.py`` for an example.
|
to reduce overall memory usage. See
|
||||||
|
:ref:`sphx_glr_python_dask-examples_gpu_training.py` for an example.
|
||||||
|
|
||||||
- Use in-place prediction when possible.
|
- Use in-place prediction when possible.
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user