From ca998df9122eae085e9f91b7c41144a750c3e826 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 31 Mar 2021 15:43:11 +0800
Subject: [PATCH] Clarify the behavior of `use_rmm`. (#6808)

* Clarify the `use_rmm` flag in the document and demos.
---
 demo/rmm_plugin/README.md             | 16 ++++++++++++++++
 demo/rmm_plugin/rmm_mgpu_with_dask.py |  8 ++++++--
 demo/rmm_plugin/rmm_singlegpu.py      | 15 ++++++++++-----
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/demo/rmm_plugin/README.md b/demo/rmm_plugin/README.md
index ad73c61f3..bf6e7f12d 100644
--- a/demo/rmm_plugin/README.md
+++ b/demo/rmm_plugin/README.md
@@ -27,5 +27,21 @@ cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_
 cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm
 ```
 
+# Informing XGBoost about the RMM pool
+
+When XGBoost is compiled with RMM, most of the large allocations will go through RMM
+allocators, but some small allocations in performance-critical areas use a different
+caching allocator so that we can have better control over memory allocation behavior.
+Users can override this behavior and force the use of RMM for all allocations by setting
+the global configuration ``use_rmm``:
+
+``` python
+with xgb.config_context(use_rmm=True):
+    clf = xgb.XGBClassifier(tree_method="gpu_hist")
+```
+
+Depending on the choice of memory pool size or the type of allocator, this may have a
+negative performance impact.
+
 * [Using RMM with a single GPU](./rmm_singlegpu.py)
 * [Using RMM with a local Dask cluster consisting of multiple GPUs](./rmm_mgpu_with_dask.py)
diff --git a/demo/rmm_plugin/rmm_mgpu_with_dask.py b/demo/rmm_plugin/rmm_mgpu_with_dask.py
index a147e3072..23c1f794e 100644
--- a/demo/rmm_plugin/rmm_mgpu_with_dask.py
+++ b/demo/rmm_plugin/rmm_mgpu_with_dask.py
@@ -4,11 +4,14 @@ import dask
 from dask.distributed import Client
 from dask_cuda import LocalCUDACluster
 
+
 def main(client):
-    # Inform XGBoost that RMM is used for GPU memory allocation
-    xgb.set_config(use_rmm=True)
+    # Optionally force XGBoost to use RMM for all GPU memory allocation, see ./README.md
+    # xgb.set_config(use_rmm=True)
 
     X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
+    # In practice one should prefer loading the data with dask collections instead of using
+    # `from_array`.
     X = dask.array.from_array(X)
     y = dask.array.from_array(y)
     dtrain = xgb.dask.DaskDMatrix(client, X, label=y)
@@ -22,6 +25,7 @@ def main(client):
     for i, e in enumerate(history['train']['merror']):
         print(f'[{i}] train-merror: {e}')
 
+
 if __name__ == '__main__':
     # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option to
     # LocalCUDACluster constructor.
diff --git a/demo/rmm_plugin/rmm_singlegpu.py b/demo/rmm_plugin/rmm_singlegpu.py
index 02caa1cc7..6b7d1b58c 100644
--- a/demo/rmm_plugin/rmm_singlegpu.py
+++ b/demo/rmm_plugin/rmm_singlegpu.py
@@ -4,13 +4,18 @@ from sklearn.datasets import make_classification
 
 # Initialize RMM pool allocator
 rmm.reinitialize(pool_allocator=True)
-# Inform XGBoost that RMM is used for GPU memory allocation
-xgb.set_config(use_rmm=True)
+# Optionally force XGBoost to use RMM for all GPU memory allocation, see ./README.md
+# xgb.set_config(use_rmm=True)
 
 X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
 dtrain = xgb.DMatrix(X, label=y)
 
-params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3,
-          'tree_method': 'gpu_hist'}
+params = {
+    "max_depth": 8,
+    "eta": 0.01,
+    "objective": "multi:softprob",
+    "num_class": 3,
+    "tree_method": "gpu_hist",
+}
 # XGBoost will automatically use the RMM pool allocator
-bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, 'train')])
+bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, "train")])
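
The Dask demo's comment points at the `rmm_pool_size` option of `dask_cuda.LocalCUDACluster` without showing it in the changed hunks. Below is a minimal sketch of how that option can combine with the optional `use_rmm` override from the README section; it is not part of the patch, and the pool size, hyperparameters, and boosting rounds are illustrative assumptions.

``` python
# Sketch only, not from the patch: values such as rmm_pool_size="2GB" and
# num_boost_round=10 are placeholders; adjust for the GPUs at hand.
import dask.array as da
import xgboost as xgb
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from sklearn.datasets import make_classification


def main(client):
    # Optionally route all of XGBoost's GPU allocations through RMM,
    # as described in the README section.
    with xgb.config_context(use_rmm=True):
        X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
        dtrain = xgb.dask.DaskDMatrix(client, da.from_array(X), label=da.from_array(y))
        output = xgb.dask.train(
            client,
            {"tree_method": "gpu_hist", "objective": "multi:softprob", "num_class": 3},
            dtrain,
            num_boost_round=10,
            evals=[(dtrain, "train")],
        )
        # output contains the trained booster and the per-round evaluation history.
        print(output["history"]["train"])


if __name__ == "__main__":
    # rmm_pool_size asks dask-cuda to create an RMM memory pool on each worker GPU,
    # which XGBoost then draws from when compiled with the RMM plugin.
    with LocalCUDACluster(rmm_pool_size="2GB") as cluster:
        with Client(cluster) as client:
            main(client)
```

Whether the global `use_rmm` setting made on the client is forwarded to the Dask workers may depend on the XGBoost version, so treat the `config_context` placement above as a sketch rather than a guaranteed behavior.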