diff --git a/doc/prediction.rst b/doc/prediction.rst
index dbfcc9cb8..853254bba 100644
--- a/doc/prediction.rst
+++ b/doc/prediction.rst
@@ -67,6 +67,18 @@ the 3-class classification dataset, and want to use the first 2 iterations of tr
 prediction, you need to provide ``iteration_range=(0, 2)``. Then the first
 :math:`2 \times 3 \times 4` trees will be used in this prediction.
 
+**************
+Early Stopping
+**************
+
+When a model is trained with early stopping, the behavior of the native Python
+interface differs from that of the sklearn/R interfaces. By default, the R and
+sklearn interfaces use ``best_iteration`` automatically, so predictions come from the
+best model, while the native :py:meth:`xgboost.Booster.predict` and
+:py:meth:`xgboost.Booster.inplace_predict` use the full model. Users can achieve the
+same behavior by passing the ``best_iteration`` attribute through the
+``iteration_range`` parameter, e.g. ``iteration_range=(0, best_iteration + 1)``. The
+``save_best`` parameter from :py:obj:`xgboost.callback.EarlyStopping` might also be useful.
 
 *********
 Predictor
diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst
index 6bb138a2f..e39e36210 100644
--- a/doc/python/python_intro.rst
+++ b/doc/python/python_intro.rst
@@ -183,7 +183,7 @@ Early stopping requires at least one set in ``evals``. If there's more than one,
 
 The model will train until the validation score stops improving. Validation error needs to decrease at least every ``early_stopping_rounds`` to continue training.
 
-If early stopping occurs, the model will have three additional fields: ``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``. Note that :py:meth:`xgboost.train` will return a model from the last iteration, not the best one.
+If early stopping occurs, the model will have two additional fields: ``bst.best_score`` and ``bst.best_iteration``. Note that :py:meth:`xgboost.train` will return a model from the last iteration, not the best one.
 
 This works with both metrics to minimize (RMSE, log loss, etc.) and to maximize (MAP, NDCG, AUC). Note that if you specify more than one evaluation metric the last one in ``param['eval_metric']`` is used for early stopping.
 
@@ -198,11 +198,11 @@ A model that has been trained or loaded can perform predictions on data sets.
   dtest = xgb.DMatrix(data)
   ypred = bst.predict(dtest)
 
-If early stopping is enabled during training, you can get predictions from the best iteration with ``bst.best_ntree_limit``:
+If early stopping is enabled during training, you can get predictions from the best iteration with ``bst.best_iteration`` (the end of ``iteration_range`` is exclusive, hence the ``+ 1``):
 
 .. code-block:: python
 
-  ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
+  ypred = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
 
 Plotting
 --------
diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index 400481f4e..e90889da2 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -176,7 +176,7 @@ One simple optimization for running consecutive predictions is using
         shap_f = xgb.dask.predict(client, booster_f, X, pred_contribs=True)
         futures.append(shap_f)
 
-    results = client.gather(futures)
+    results = client.gather(futures)
 
 
 This is only available on functional interface, as the Scikit-Learn wrapper doesn't know
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 75b701a05..a8916b783 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -744,13 +744,13 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
  *   following available fields in the JSON object:
  *
  *     "type": [0, 6]
- *       0: normal prediction
- *       1: output margin
- *       2: predict contribution
- *       3: predict approximated contribution
- *       4: predict feature interaction
- *       5: predict approximated feature interaction
- *       6: predict leaf
+ *       - 0: normal prediction
+ *       - 1: output margin
+ *       - 2: predict contribution
+ *       - 3: predict approximated contribution
+ *       - 4: predict feature interaction
+ *       - 5: predict approximated feature interaction
+ *       - 6: predict leaf
 *     "training": bool
 *       Whether the prediction function is used as part of a training loop.  **Not used
 *       for inplace prediction**.
@@ -773,7 +773,8 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
 *   disregarding the use of multi-class model, and leaf prediction will output 4-dim
 *   array representing: (n_samples, n_iterations, n_classes, n_trees_in_forest)
 *
- *   Run a normal prediction with strict output shape, 2 dim for softprob , 1 dim for others.
+ *   Example JSON input for running a normal prediction with strict output shape, 2 dim
+ *   for softprob, 1 dim for others.
 * \code
 * {
 *   "type": 0,
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 8430d5acd..86cf77243 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -1683,7 +1683,9 @@ class Booster(object):
         iteration_range: Tuple[int, int] = (0, 0),
         strict_shape: bool = False,
     ) -> np.ndarray:
-        """Predict with data.
+        """Predict with data. The full model is used unless `iteration_range` is
+        specified, meaning users have to either slice the model or use the
+        ``best_iteration`` attribute to get predictions from the best model returned by early stopping.
 
         .. note::
 
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index b9b7916a8..2e9f2f408 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -794,8 +794,8 @@ class XGBModel(XGBModelBase):
         base_margin: Optional[array_like] = None,
         iteration_range: Optional[Tuple[int, int]] = None,
     ) -> np.ndarray:
-        """
-        Predict with `X`.
+        """Predict with `X`. If the model was trained with early stopping, then
+        `best_iteration` is used automatically.
 
         .. note:: This function is only thread safe for `gbtree` and `dart`.
 
@@ -819,6 +819,7 @@
             used in this prediction.
 
             .. versionadded:: 1.4.0
+
         Returns
         -------
         prediction
@@ -860,7 +861,8 @@
         ntree_limit: int = 0,
         iteration_range: Optional[Tuple[int, int]] = None
     ) -> np.ndarray:
-        """Return the predicted leaf every tree for each sample.
+ """Return the predicted leaf every tree for each sample. If the model is trained with + early stopping, then `best_iteration` is used automatically. Parameters ---------- @@ -879,6 +881,7 @@ class XGBModel(XGBModelBase): For each datapoint x in X and for each tree, return the index of the leaf x ends up in. Leaves are numbered within ``[0; 2**(self.max_depth+1))``, possibly with gaps in the numbering. + """ iteration_range = _convert_ntree_limit( self.get_booster(), ntree_limit, iteration_range