diff --git a/demo/dask/README.md b/demo/dask/README.md
deleted file mode 100644
index b70248ca4..000000000
--- a/demo/dask/README.md
+++ /dev/null
@@ -1,6 +0,0 @@
-Dask
-====
-
-This directory contains some demonstrations for using `dask` with `XGBoost`.
-For an overview, see
-https://xgboost.readthedocs.io/en/latest/tutorials/dask.html .
\ No newline at end of file
diff --git a/demo/dask/README.rst b/demo/dask/README.rst
new file mode 100644
index 000000000..456425e91
--- /dev/null
+++ b/demo/dask/README.rst
@@ -0,0 +1,5 @@
+XGBoost Dask Feature Walkthrough
+================================
+
+This directory contains some demonstrations for using `dask` with `XGBoost`.  For an
+overview, see :doc:`/tutorials/dask`.
diff --git a/demo/dask/cpu_survival.py b/demo/dask/cpu_survival.py
index c2b815074..c79f7d96c 100644
--- a/demo/dask/cpu_survival.py
+++ b/demo/dask/cpu_survival.py
@@ -1,3 +1,9 @@
+"""
+Example of training a survival model with Dask on CPU
+=====================================================
+
+"""
+
 import xgboost as xgb
 import os
 from xgboost.dask import DaskDMatrix
diff --git a/demo/dask/cpu_training.py b/demo/dask/cpu_training.py
index b86958e8f..6ee91dafa 100644
--- a/demo/dask/cpu_training.py
+++ b/demo/dask/cpu_training.py
@@ -1,3 +1,8 @@
+"""
+Example of training with Dask on CPU
+====================================
+
+"""
 import xgboost as xgb
 from xgboost.dask import DaskDMatrix
 from dask.distributed import Client
diff --git a/demo/dask/callbacks.py b/demo/dask/dask_callbacks.py
similarity index 97%
rename from demo/dask/callbacks.py
rename to demo/dask/dask_callbacks.py
index e3bf5b39d..64d7b0f28 100644
--- a/demo/dask/callbacks.py
+++ b/demo/dask/dask_callbacks.py
@@ -1,4 +1,7 @@
-"""Example of using callbacks in Dask"""
+"""
+Example of using callbacks with Dask
+====================================
+"""
 import numpy as np
 import xgboost as xgb
 from xgboost.dask import DaskDMatrix
diff --git a/demo/dask/gpu_training.py b/demo/dask/gpu_training.py
index 7284e042b..1752a59e9 100644
--- a/demo/dask/gpu_training.py
+++ b/demo/dask/gpu_training.py
@@ -1,3 +1,7 @@
+"""
+Example of training with Dask on GPU
+====================================
+"""
 from dask_cuda import LocalCUDACluster
 from dask.distributed import Client
 from dask import array as da
diff --git a/demo/dask/sklearn_cpu_training.py b/demo/dask/sklearn_cpu_training.py
index 0549aa3d4..69f5dc788 100644
--- a/demo/dask/sklearn_cpu_training.py
+++ b/demo/dask/sklearn_cpu_training.py
@@ -1,6 +1,7 @@
-'''Dask interface demo:
-
-Use scikit-learn regressor interface with CPU histogram tree method.'''
+"""
+Use scikit-learn regressor interface with CPU histogram tree method
+===================================================================
+"""
 from dask.distributed import Client
 from dask.distributed import LocalCluster
 from dask import array as da
@@ -16,7 +17,7 @@ def main(client):
     y = da.random.random(m, partition_size)
 
     regressor = xgboost.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
-    regressor.set_params(tree_method='hist')
+    regressor.set_params(tree_method="hist")
     # assigning client here is optional
     regressor.client = client
 
@@ -26,13 +27,13 @@ def main(client):
     bst = regressor.get_booster()
     history = regressor.evals_result()
 
-    print('Evaluation history:', history)
+    print("Evaluation history:", history)
     # returned prediction is always a dask array.
     assert isinstance(prediction, da.Array)
-    return bst # returning the trained model
+    return bst  # returning the trained model
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     # or use other clusters for scaling
     with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
         with Client(cluster) as client:
diff --git a/demo/dask/sklearn_gpu_training.py b/demo/dask/sklearn_gpu_training.py
index afba21504..3031d9705 100644
--- a/demo/dask/sklearn_gpu_training.py
+++ b/demo/dask/sklearn_gpu_training.py
@@ -1,6 +1,7 @@
-'''Dask interface demo:
-
-Use scikit-learn regressor interface with GPU histogram tree method.'''
+"""
+Use scikit-learn regressor interface with GPU histogram tree method
+===================================================================
+"""
 from dask.distributed import Client
 
 # It's recommended to use dask_cuda for GPU assignment
diff --git a/doc/c.rst b/doc/c.rst
index 2dc15269d..ee9dd8629 100644
--- a/doc/c.rst
+++ b/doc/c.rst
@@ -2,11 +2,11 @@
 XGBoost C Package
 #################
 
-XGBoost implements a set of C API designed for various bindings, we maintain its
-stability and the CMake/make build interface. See ``demo/c-api/README.md`` for an
-overview and related examples. Also one can generate doxygen document by providing
-``-DBUILD_C_DOC=ON`` as parameter to ``CMake`` during build, or simply look at function
-comments in ``include/xgboost/c_api.h``.
+XGBoost implements a set of C APIs designed for various bindings; we maintain their
+stability and the CMake/make build interface. See :doc:`/tutorials/c_api_tutorial` for an
+introduction and ``demo/c-api/`` for related examples. One can also generate the doxygen
+documents by providing ``-DBUILD_C_DOC=ON`` as a parameter to ``CMake`` during build, or
+simply look at the function comments in ``include/xgboost/c_api.h``.
 
 * `C API documentation (latest master branch) `_
 * `C API documentation (last stable release) `_
diff --git a/doc/conf.py b/doc/conf.py
index a43d384fd..53b2ba503 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -90,8 +90,10 @@ extensions = [
 ]
 
 sphinx_gallery_conf = {
-    "examples_dirs": "../demo/guide-python",  # path to your example scripts
-    "gallery_dirs": "python/examples",  # path to where to save gallery generated output
+    # path to your example scripts
+    "examples_dirs": ["../demo/guide-python", "../demo/dask"],
+    # path to where to save gallery generated output
+    "gallery_dirs": ["python/examples", "python/dask-examples"],
     "matplotlib_animations": True,
 }
 
@@ -203,10 +205,10 @@ latex_documents = [
 
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3.6", None),
-    "numpy": ("http://docs.scipy.org/doc/numpy/", None),
-    "scipy": ("http://docs.scipy.org/doc/scipy/reference/", None),
+    "numpy": ("https://docs.scipy.org/doc/numpy/", None),
+    "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None),
     "pandas": ("http://pandas-docs.github.io/pandas-docs-travis/", None),
-    "sklearn": ("http://scikit-learn.org/stable", None),
+    "sklearn": ("https://scikit-learn.org/stable", None),
     "dask": ("https://docs.dask.org/en/stable/", None),
     "distributed": ("https://distributed.dask.org/en/stable/", None),
 }
diff --git a/doc/python/.gitignore b/doc/python/.gitignore
index b7265688a..843a492dd 100644
--- a/doc/python/.gitignore
+++ b/doc/python/.gitignore
@@ -1 +1,2 @@
-examples
\ No newline at end of file
+examples
+dask-examples
\ No newline at end of file
diff --git a/doc/python/index.rst b/doc/python/index.rst
index cf986bff4..cffc8a7fd 100644
--- a/doc/python/index.rst
+++ b/doc/python/index.rst
@@ -14,3 +14,4 @@ Contents
   callbacks
   model
   examples/index
+  dask-examples/index
diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index efe455b1e..84b8e9435 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -63,15 +63,17 @@ on a dask cluster:
         evals=[(dtrain, "train")],
     )
 
-Here we first create a cluster in single-node mode with ``dask.distributed.LocalCluster``, then
-connect a ``dask.distributed.Client`` to this cluster, setting up an environment for later
-computation. Notice that the cluster construction is guared by ``__name__ == "__main__"``, which is
-necessary otherwise there might be obscure errors.
+Here we first create a cluster in single-node mode with
+:py:class:`distributed.LocalCluster`, then connect a :py:class:`distributed.Client` to
+this cluster, setting up an environment for later computation. Notice that the cluster
+construction is guarded by ``__name__ == "__main__"``, which is necessary; otherwise
+there might be obscure errors.
 
-We then create a ``DaskDMatrix`` object and pass it to ``train``, along with some other parameters,
-much like XGBoost's normal, non-dask interface. Unlike that interface, ``data`` and ``label`` must
-be either `Dask DataFrame <https://docs.dask.org/en/latest/dataframe.html>`_ or
-`Dask Array <https://docs.dask.org/en/latest/array.html>`_ instances.
+We then create a :py:class:`xgboost.dask.DaskDMatrix` object and pass it to
+:py:func:`xgboost.dask.train`, along with some other parameters, much like XGBoost's
+normal, non-dask interface. Unlike that interface, ``data`` and ``label`` must be either
+:py:class:`Dask DataFrame <dask.dataframe.DataFrame>` or :py:class:`Dask Array
+<dask.array.Array>` instances.
 
 The primary difference with XGBoost's dask interface is we pass our dask client as an
 additional argument for carrying out the computation.  Note that if
@@ -86,7 +88,7 @@ returns a model and the computation history as a Python dictionary:
 
   {'booster': Booster,
    'history': dict}
 
-For prediction, pass the ``output`` returned by ``train`` into ``xgb.dask.predict``:
+For prediction, pass the ``output`` returned by ``train`` into :py:func:`xgboost.dask.predict`:
 
 .. code-block:: python
@@ -105,14 +107,15 @@ computation a bit faster when meta information like ``base_margin`` is not neede
 
 Here ``prediction`` is a dask ``Array`` object containing predictions from model if input
 is a ``DaskDMatrix`` or ``da.Array``.  When putting dask collection directly into the
-``predict`` function or using ``inplace_predict``, the output type depends on input data.
-See next section for details.
+``predict`` function or using :py:func:`xgboost.dask.inplace_predict`, the output type
+depends on input data.  See next section for details.
 
 Alternatively, XGBoost also implements the Scikit-Learn interface with
-``DaskXGBClassifier``, ``DaskXGBRegressor``, ``DaskXGBRanker`` and 2 random forest
-variances. This wrapper is similar to the single node Scikit-Learn interface in xgboost,
-with dask collection as inputs and has an additional ``client`` attribute. See following
-sections and ``xgboost/demo/dask`` for more examples.
+:py:class:`~xgboost.dask.DaskXGBClassifier`, :py:class:`~xgboost.dask.DaskXGBRegressor`,
+:py:class:`~xgboost.dask.DaskXGBRanker` and two random forest variants. This wrapper is
+similar to the single node Scikit-Learn interface in xgboost, with dask collections as
+inputs and has an additional ``client`` attribute. See the following sections and
+:ref:`sphx_glr_python_dask-examples` for more examples.
 
 
 ******************
@@ -152,7 +155,7 @@ depending on output shape.  For example, when shap based prediction is used, the
 value can have 3 or 4 dimensions , in such cases an ``Array`` is always returned.
 
 The performance of running prediction, either using ``predict`` or ``inplace_predict``, is
-sensitive to number of blocks.  Internally, it's implemented using ``da.map_blocks`` or
+sensitive to number of blocks.  Internally, it's implemented using ``da.map_blocks`` and
 ``dd.map_partitions``.  When number of partitions is large and each of them have only
 small amount of data, the overhead of calling predict becomes visible.  On the other
 hand, if not using GPU, the number of threads used for prediction on each block matters.  Right
@@ -160,7 +163,7 @@ now, xgboost uses single thread for each partition.  If the number of blocks on
 workers is smaller than number of cores, then the CPU workers might not be fully
 utilized.
 
 One simple optimization for running consecutive predictions is using
-``distributed.Future``:
+:py:class:`distributed.Future`:
 
 .. code-block:: python
@@ -504,8 +507,9 @@ Here are some pratices on reducing memory usage with dask and xgboost.
   nice summary.
 
 - When using GPU input, like dataframe loaded by ``dask_cudf``, you can try
-  ``xgboost.dask.DaskDeviceQuantileDMatrix`` as a drop in replacement for ``DaskDMatrix``
-  to reduce overall memory usage.  See ``demo/dask/gpu_training.py`` for an example.
+  :py:class:`xgboost.dask.DaskDeviceQuantileDMatrix` as a drop-in replacement for ``DaskDMatrix``
+  to reduce overall memory usage.  See
+  :ref:`sphx_glr_python_dask-examples_gpu_training.py` for an example.
 
 - Use in-place prediction when possible.
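
For readers of this patch, the end-to-end workflow that the updated
``doc/tutorials/dask.rst`` describes (cluster construction guarded by
``__name__ == "__main__"``, a ``DaskDMatrix`` built from dask collections, ``train``
returning ``{'booster': Booster, 'history': dict}``, and ``predict`` yielding a dask
array) can be sketched as follows.  The data shapes, chunk sizes, and parameters here are
illustrative only, not taken from the demos:

.. code-block:: python

    import xgboost as xgb
    from dask import array as da
    from dask.distributed import Client, LocalCluster


    def main(client: Client) -> xgb.Booster:
        # Any dask collection works as input; random arrays keep this self-contained.
        X = da.random.random((100_000, 20), chunks=(10_000, 20))
        y = da.random.random(100_000, chunks=10_000)

        dtrain = xgb.dask.DaskDMatrix(client, X, y)
        # train returns {'booster': Booster, 'history': dict}.
        output = xgb.dask.train(
            client,
            {"tree_method": "hist"},
            dtrain,
            num_boost_round=4,
            evals=[(dtrain, "train")],
        )
        # Prediction on a DaskDMatrix always yields a dask array.
        prediction = xgb.dask.predict(client, output, dtrain)
        assert isinstance(prediction, da.Array)
        return output["booster"]


    if __name__ == "__main__":
        # Guard the cluster construction, as the tutorial recommends,
        # to avoid obscure multiprocessing errors.
        with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
            with Client(cluster) as client:
                main(client)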
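
The ``distributed.Future`` optimization for consecutive predictions mentioned in the
prediction section can be sketched like this.  The helper name and the ``datasets``
argument are hypothetical; ``client.scatter`` and the acceptance of a scattered booster
by :py:func:`xgboost.dask.predict` are the documented pieces:

.. code-block:: python

    import xgboost as xgb
    from dask.distributed import Client


    def consecutive_predictions(client: Client, booster: xgb.Booster, datasets):
        # Ship the booster to the workers once; scatter returns a
        # distributed.Future that predict accepts in place of the
        # in-memory model, avoiding repeated serialization per call.
        booster_future = client.scatter(booster, broadcast=True)
        return [xgb.dask.predict(client, booster_future, X) for X in datasets]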
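
Likewise, the memory note at the end of the patch amounts to a one-line swap of the
DMatrix class.  A minimal sketch, assuming ``dask_cuda`` and ``cupy`` are installed and a
GPU is available:

.. code-block:: python

    import cupy
    import xgboost as xgb
    from dask import array as da
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster


    def main(client: Client) -> None:
        X = da.random.random((100_000, 20), chunks=(10_000, 20))
        y = da.random.random(100_000, chunks=10_000)
        # DaskDeviceQuantileDMatrix expects device (GPU) input, so move the
        # blocks onto the GPU first; dask_cudf dataframes work as well.
        X = X.map_blocks(cupy.asarray)
        y = y.map_blocks(cupy.asarray)

        # Drop-in replacement for DaskDMatrix with lower memory usage.
        dtrain = xgb.dask.DaskDeviceQuantileDMatrix(client, X, y)
        xgb.dask.train(client, {"tree_method": "gpu_hist"}, dtrain, num_boost_round=4)


    if __name__ == "__main__":
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                main(client)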