diff --git a/demo/dask/cpu_survival.py b/demo/dask/cpu_survival.py
index 7fe0570de..8bf464ce2 100644
--- a/demo/dask/cpu_survival.py
+++ b/demo/dask/cpu_survival.py
@@ -9,7 +9,7 @@ import os
 import dask.dataframe as dd
 from dask.distributed import Client, LocalCluster
 
-import xgboost as xgb
+from xgboost import dask as dxgb
 from xgboost.dask import DaskDMatrix
 
 
@@ -48,14 +48,14 @@ def main(client):
         "lambda": 0.01,
         "alpha": 0.02,
     }
-    output = xgb.dask.train(
+    output = dxgb.train(
         client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")]
     )
     bst = output["booster"]
     history = output["history"]
 
     # you can pass output directly into `predict` too.
-    prediction = xgb.dask.predict(client, bst, dtrain)
+    prediction = dxgb.predict(client, bst, dtrain)
     print("Evaluation history: ", history)
 
     # Uncomment the following line to save the model to the disk
diff --git a/demo/dask/cpu_training.py b/demo/dask/cpu_training.py
index 811af5cd3..0f3316741 100644
--- a/demo/dask/cpu_training.py
+++ b/demo/dask/cpu_training.py
@@ -6,7 +6,7 @@ Example of training with Dask on CPU
 from dask import array as da
 from dask.distributed import Client, LocalCluster
 
-import xgboost as xgb
+from xgboost import dask as dxgb
 from xgboost.dask import DaskDMatrix
 
 
@@ -25,7 +25,7 @@ def main(client):
     # distributed version of train returns a dictionary containing the
     # resulting booster and evaluation history obtained from
     # evaluation metrics.
-    output = xgb.dask.train(
+    output = dxgb.train(
         client,
         {"verbosity": 1, "tree_method": "hist"},
         dtrain,
@@ -36,7 +36,7 @@ def main(client):
     history = output["history"]
 
     # you can pass output directly into `predict` too.
-    prediction = xgb.dask.predict(client, bst, dtrain)
+    prediction = dxgb.predict(client, bst, dtrain)
     print("Evaluation history:", history)
 
     return prediction
diff --git a/demo/dask/dask_callbacks.py b/demo/dask/dask_callbacks.py
index 408297d9e..a4b0f5648 100644
--- a/demo/dask/dask_callbacks.py
+++ b/demo/dask/dask_callbacks.py
@@ -8,6 +8,7 @@ from dask_ml.datasets import make_regression
 from dask_ml.model_selection import train_test_split
 
 import xgboost as xgb
+from xgboost import dask as dxgb
 from xgboost.dask import DaskDMatrix
 
 
@@ -61,7 +62,7 @@ def main(client):
     dtrain = DaskDMatrix(client, X_train, y_train)
     dtest = DaskDMatrix(client, X_test, y_test)
 
-    output = xgb.dask.train(
+    output = dxgb.train(
         client,
         {
             "verbosity": 1,
diff --git a/demo/dask/gpu_training.py b/demo/dask/gpu_training.py
index 6eea00692..fd5b35bf3 100644
--- a/demo/dask/gpu_training.py
+++ b/demo/dask/gpu_training.py
@@ -8,7 +8,6 @@ from dask import dataframe as dd
 from dask.distributed import Client
 from dask_cuda import LocalCUDACluster
 
-import xgboost as xgb
 from xgboost import dask as dxgb
 from xgboost.dask import DaskDMatrix
 
@@ -21,7 +20,7 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
     # Use train method from xgboost.dask instead of xgboost. This distributed version
     # of train returns a dictionary containing the resulting booster and evaluation
     # history obtained from evaluation metrics.
-    output = xgb.dask.train(
+    output = dxgb.train(
         client,
         {
             "verbosity": 2,
@@ -37,7 +36,7 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
     history = output["history"]
 
     # you can pass output directly into `predict` too.
-    prediction = xgb.dask.predict(client, bst, dtrain)
+    prediction = dxgb.predict(client, bst, dtrain)
     print("Evaluation history:", history)
     return prediction
 
@@ -56,14 +55,14 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> d
     # be used for anything else other than training unless a reference is specified. See
     # the `ref` argument of `DaskQuantileDMatrix`.
     dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
-    output = xgb.dask.train(
+    output = dxgb.train(
         client,
         {"verbosity": 2, "tree_method": "hist", "device": "cuda"},
         dtrain,
         num_boost_round=4,
     )
 
-    prediction = xgb.dask.predict(client, output, X)
+    prediction = dxgb.predict(client, output, X)
     return prediction
 
 
diff --git a/demo/dask/sklearn_cpu_training.py b/demo/dask/sklearn_cpu_training.py
index 12d55493c..38ea25e61 100644
--- a/demo/dask/sklearn_cpu_training.py
+++ b/demo/dask/sklearn_cpu_training.py
@@ -5,7 +5,7 @@ Use scikit-learn regressor interface with CPU histogram tree method
 from dask import array as da
 from dask.distributed import Client, LocalCluster
 
-import xgboost
+from xgboost import dask as dxgb
 
 
 def main(client):
@@ -16,7 +16,7 @@ def main(client):
     X = da.random.random((m, n), partition_size)
     y = da.random.random(m, partition_size)
 
-    regressor = xgboost.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
+    regressor = dxgb.DaskXGBRegressor(verbosity=1, n_estimators=2)
     regressor.set_params(tree_method="hist")
     # assigning client here is optional
     regressor.client = client
diff --git a/demo/dask/sklearn_gpu_training.py b/demo/dask/sklearn_gpu_training.py
index 32a994464..768690995 100644
--- a/demo/dask/sklearn_gpu_training.py
+++ b/demo/dask/sklearn_gpu_training.py
@@ -9,7 +9,7 @@ from dask.distributed import Client
 # It's recommended to use dask_cuda for GPU assignment
 from dask_cuda import LocalCUDACluster
 
-import xgboost
+from xgboost import dask as dxgb
 
 
 def main(client):
@@ -20,7 +20,7 @@ def main(client):
     X = da.random.random((m, n), partition_size)
     y = da.random.random(m, partition_size)
 
-    regressor = xgboost.dask.DaskXGBRegressor(verbosity=1)
+    regressor = dxgb.DaskXGBRegressor(verbosity=1)
     # set the device to CUDA
     regressor.set_params(tree_method="hist", device="cuda")
     # assigning client here is optional
diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index 7ab251bcf..148230fe6 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -39,7 +39,8 @@ on a dask cluster:
 
 .. code-block:: python
 
-    import xgboost as xgb
+    from xgboost import dask as dxgb
+
     import dask.array as da
     import dask.distributed
 
@@ -53,11 +54,11 @@ on a dask cluster:
     X = da.random.random(size=(num_obs, num_features), chunks=(1000, num_features))
     y = da.random.random(size=(num_obs, 1), chunks=(1000, 1))
 
-    dtrain = xgb.dask.DaskDMatrix(client, X, y)
+    dtrain = dxgb.DaskDMatrix(client, X, y)
     # or
-    # dtrain = xgb.dask.DaskQuantileDMatrix(client, X, y)
+    # dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
 
-    output = xgb.dask.train(
+    output = dxgb.train(
         client,
         {"verbosity": 2, "tree_method": "hist", "objective": "reg:squarederror"},
         dtrain,
@@ -87,25 +88,27 @@ returns a model and the computation history as a Python dictionary:
 .. code-block:: python
 
-    {'booster': Booster,
-     'history': dict}
+    {
+        "booster": Booster,
+        "history": dict,
+    }
 
 For prediction, pass the ``output`` returned by ``train`` into :py:func:`xgboost.dask.predict`:
 
 .. code-block:: python
 
-    prediction = xgb.dask.predict(client, output, dtrain)
+    prediction = dxgb.predict(client, output, dtrain)
 
     # Or equivalently, pass ``output['booster']``:
-    prediction = xgb.dask.predict(client, output['booster'], dtrain)
+    prediction = dxgb.predict(client, output['booster'], dtrain)
 
 Eliminating the construction of DaskDMatrix is also possible, this can make the
 computation a bit faster when meta information like ``base_margin`` is not needed:
 
 .. code-block:: python
 
-    prediction = xgb.dask.predict(client, output, X)
+    prediction = dxgb.predict(client, output, X)
     # Use inplace version.
-    prediction = xgb.dask.inplace_predict(client, output, X)
+    prediction = dxgb.inplace_predict(client, output, X)
 
 Here ``prediction`` is a dask ``Array`` object containing predictions from model if input
 is a ``DaskDMatrix`` or ``da.Array``. When putting dask collection directly into the
@@ -134,14 +137,14 @@ both memory usage and prediction time.
 .. code-block:: python
 
     # dtrain is the DaskDMatrix defined above.
-    prediction = xgb.dask.predict(client, booster, dtrain)
+    prediction = dxgb.predict(client, booster, dtrain)
 
 or equivalently:
 
 .. code-block:: python
 
     # where X is a dask DataFrame or dask Array.
-    prediction = xgb.dask.predict(client, booster, X)
+    prediction = dxgb.predict(client, booster, X)
 
 Also for inplace prediction:
 
@@ -149,7 +152,7 @@ Also for inplace prediction:
 
     # where X is a dask DataFrame or dask Array backed by cupy or cuDF.
     booster.set_param({"device": "cuda"})
-    prediction = xgb.dask.inplace_predict(client, booster, X)
+    prediction = dxgb.inplace_predict(client, booster, X)
 
 When input is ``da.Array`` object, output is always ``da.Array``. However, if the input
 type is ``dd.DataFrame``, output can be ``dd.Series``, ``dd.DataFrame`` or ``da.Array``,
@@ -174,7 +177,7 @@ One simple optimization for running consecutive predictions is using
     futures = []
     for X in dataset:
         # Here we pass in a future instead of concrete booster
-        shap_f = xgb.dask.predict(client, booster_f, X, pred_contribs=True)
+        shap_f = dxgb.predict(client, booster_f, X, pred_contribs=True)
         futures.append(shap_f)
 
     results = client.gather(futures)
@@ -186,7 +189,7 @@ Scikit-Learn wrapper object:
 
 .. code-block:: python
 
-    cls = xgb.dask.DaskXGBClassifier()
+    cls = dxgb.DaskXGBClassifier()
     cls.fit(X, y)
 
     booster = cls.get_booster()
@@ -207,12 +210,12 @@ collection.
 .. code-block:: python
 
     from distributed import LocalCluster, Client
-    import xgboost as xgb
+    from xgboost import dask as dxgb
 
 
     def main(client: Client) -> None:
         X, y = load_data()
-        clf = xgb.dask.DaskXGBClassifier(n_estimators=100, tree_method="hist")
+        clf = dxgb.DaskXGBClassifier(n_estimators=100, tree_method="hist")
         clf.client = client  # assign the client
         clf.fit(X, y, eval_set=[(X, y)])
         proba = clf.predict_proba(X)
@@ -242,7 +245,7 @@ In the example below, a ``KubeCluster`` is used for `deploying Dask on Kubernete
 
     from dask_kubernetes import KubeCluster  # Need to install the ``dask-kubernetes`` package
     from dask.distributed import Client
-    import xgboost as xgb
+    from xgboost import dask as dxgb
     import dask
     import dask.array as da
 
@@ -265,7 +268,7 @@ In the example below, a ``KubeCluster`` is used for `deploying Dask on Kubernete
         X = da.random.random(size=(m, n), chunks=100)
         y = da.random.random(size=(m, ), chunks=100)
 
-        regressor = xgb.dask.DaskXGBRegressor(n_estimators=10, missing=0.0)
+        regressor = dxgb.DaskXGBRegressor(n_estimators=10, missing=0.0)
         regressor.client = client
         regressor.set_params(tree_method='hist', device="cuda")
         regressor.fit(X, y, eval_set=[(X, y)])
@@ -298,7 +301,7 @@ threads in each process for training. But if ``nthread`` parameter is set:
 
 .. code-block:: python
 
-    output = xgb.dask.train(
+    output = dxgb.train(
         client,
         {"verbosity": 1, "nthread": 8, "tree_method": "hist"},
         dtrain,
@@ -330,12 +333,12 @@ Functional interface:
 
     async with dask.distributed.Client(scheduler_address, asynchronous=True) as client:
         X, y = generate_array()
-        m = await xgb.dask.DaskDMatrix(client, X, y)
-        output = await xgb.dask.train(client, {}, dtrain=m)
+        m = await dxgb.DaskDMatrix(client, X, y)
+        output = await dxgb.train(client, {}, dtrain=m)
 
-        with_m = await xgb.dask.predict(client, output, m)
-        with_X = await xgb.dask.predict(client, output, X)
-        inplace = await xgb.dask.inplace_predict(client, output, X)
+        with_m = await dxgb.predict(client, output, m)
+        with_X = await dxgb.predict(client, output, X)
+        inplace = await dxgb.inplace_predict(client, output, X)
 
         # Use ``client.compute`` instead of the ``compute`` method from dask collection
         print(await client.compute(with_m))
@@ -349,7 +352,7 @@ actual computation will return a coroutine and hence require awaiting:
 
     async with dask.distributed.Client(scheduler_address, asynchronous=True) as client:
         X, y = generate_array()
-        regressor = await xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
+        regressor = await dxgb.DaskXGBRegressor(verbosity=1, n_estimators=2)
         regressor.set_params(tree_method='hist')  # trivial method, synchronous operation
         regressor.client = client  # accessing attribute, synchronous operation
         regressor = await regressor.fit(X, y, eval_set=[(X, y)])
@@ -371,7 +374,7 @@ To enable early stopping, pass one or more validation sets containing ``DaskDMat
 .. code-block:: python
 
     import dask.array as da
-    import xgboost as xgb
+    from xgboost import dask as dxgb
 
     num_rows = 1e6
     num_features = 100
@@ -398,19 +401,19 @@ To enable early stopping, pass one or more validation sets containing ``DaskDMat
         chunks=(rows_per_chunk, 1)
     )
 
-    dtrain = xgb.dask.DaskDMatrix(
+    dtrain = dxgb.DaskDMatrix(
         client=client,
         data=data,
         label=labels
     )
 
-    dvalid = xgb.dask.DaskDMatrix(
+    dvalid = dxgb.DaskDMatrix(
         client=client,
         data=X_eval,
         label=y_eval
     )
 
-    result = xgb.dask.train(
+    result = dxgb.train(
         client=client,
         params={
             "objective": "reg:squarederror",
@@ -421,7 +424,7 @@ To enable early stopping, pass one or more validation sets containing ``DaskDMat
         early_stopping_rounds=3
     )
 
-When validation sets are provided to ``xgb.dask.train()`` in this way, the model object returned by ``xgb.dask.train()`` contains a history of evaluation metrics for each validation set, across all boosting rounds.
+When validation sets are provided to :py:func:`xgboost.dask.train` in this way, the model object returned by :py:func:`xgboost.dask.train` contains a history of evaluation metrics for each validation set, across all boosting rounds.
 
 .. code-block:: python
 
@@ -463,7 +466,7 @@ interface, including callback functions, custom evaluation metric and objective:
         save_best=True,
     )
 
-    booster = xgb.dask.train(
+    booster = dxgb.train(
         client,
         params={
             "objective": "binary:logistic",
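
As a quick end-to-end check of the ``dxgb`` alias this patch introduces, the sketch below strings the changed calls together on a throwaway local cluster. It is a minimal illustration, not part of the patch: the cluster size, data shapes, and chunking are arbitrary assumptions.

.. code-block:: python

    from dask import array as da
    from dask.distributed import Client, LocalCluster

    from xgboost import dask as dxgb


    def main(client: Client) -> None:
        # Synthetic regression data; shapes and chunks are illustrative only.
        m, n = 100_000, 20
        X = da.random.random((m, n), chunks=(10_000, n))
        y = da.random.random(m, chunks=10_000)

        # DaskDMatrix holds references to the distributed partitions.
        dtrain = dxgb.DaskDMatrix(client, X, y)

        # train() returns a dict holding the booster and the evaluation history.
        output = dxgb.train(
            client,
            {"verbosity": 1, "tree_method": "hist"},
            dtrain,
            num_boost_round=4,
            evals=[(dtrain, "train")],
        )
        prediction = dxgb.predict(client, output["booster"], dtrain)
        print("Evaluation history:", output["history"])
        print(prediction.compute())


    if __name__ == "__main__":
        # Two small workers are enough to exercise the distributed code path.
        with LocalCluster(n_workers=2, threads_per_worker=1) as cluster:
            with Client(cluster) as client:
                main(client)

Note that ``demo/dask/dask_callbacks.py`` is the only file in the patch that keeps the plain ``import xgboost as xgb`` alongside ``dxgb``, presumably because it also uses non-Dask parts of the library; everything else goes through the single ``dxgb`` namespace.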