From 7bdedacb54be482d96ada66dd576da57ab41bd9d Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 3 Aug 2021 13:11:52 +0800
Subject: [PATCH] Document for `process_type`. (#7135)

* Update document for prune and refresh.

* Add demo.
---
 demo/guide-python/update_process.py | 90 +++++++++++++++++++++++++++++
 doc/parameter.rst                   |  2 +-
 doc/treemethod.rst                  | 21 ++++++-
 tests/python-gpu/test_gpu_demos.py  |  6 ++
 4 files changed, 115 insertions(+), 4 deletions(-)
 create mode 100644 demo/guide-python/update_process.py

diff --git a/demo/guide-python/update_process.py b/demo/guide-python/update_process.py
new file mode 100644
index 000000000..53206f9c2
--- /dev/null
+++ b/demo/guide-python/update_process.py
@@ -0,0 +1,90 @@
+"""Demo for using `process_type` with `prune` and `refresh`. Modifying existing trees
+is not a well-established use of XGBoost, so feel free to experiment.
+
+"""
+
+import xgboost as xgb
+from sklearn.datasets import load_boston
+import numpy as np
+
+
+def main():
+    n_rounds = 32
+
+    X, y = load_boston(return_X_y=True)
+
+    # Train a model on the first half of the data.
+    X_train = X[: X.shape[0] // 2]
+    y_train = y[: y.shape[0] // 2]
+    Xy = xgb.DMatrix(X_train, y_train)
+    evals_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
+    booster = xgb.train(
+        {"tree_method": "gpu_hist", "max_depth": 6},
+        Xy,
+        num_boost_round=n_rounds,
+        evals=[(Xy, "Train")],
+        evals_result=evals_result,
+    )
+    SHAP = booster.predict(Xy, pred_contribs=True)
+
+    # Refresh the leaf values and tree statistics on the second half of the data.
+    X_refresh = X[X.shape[0] // 2:]
+    y_refresh = y[y.shape[0] // 2:]
+    Xy_refresh = xgb.DMatrix(X_refresh, y_refresh)
+    # With refresh_leaf set to True, the model adapts to the other half of the
+    # data by changing leaf values (no change in split conditions).
+    refresh_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
+    refreshed = xgb.train(
+        {"process_type": "update", "updater": "refresh", "refresh_leaf": True},
+        Xy_refresh,
+        num_boost_round=n_rounds,
+        xgb_model=booster,
+        evals=[(Xy, "Original"), (Xy_refresh, "Train")],
+        evals_result=refresh_result,
+    )
+
+    # Refresh the model without changing the leaf values; tree statistics
+    # including cover and weight are still refreshed.
+    refresh_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
+    refreshed = xgb.train(
+        {"process_type": "update", "updater": "refresh", "refresh_leaf": False},
+        Xy_refresh,
+        num_boost_round=n_rounds,
+        xgb_model=booster,
+        evals=[(Xy, "Original"), (Xy_refresh, "Train")],
+        evals_result=refresh_result,
+    )
+    # Without refreshing the leaf values, the resulting trees should be the same
+    # as in the original model except for the accumulated statistics. The rtol
+    # accounts for floating point error in prediction.
+    np.testing.assert_allclose(
+        refresh_result["Original"]["rmse"], evals_result["Train"]["rmse"], rtol=1e-5
+    )
+    # But the SHAP values change, since the cover in the tree nodes has changed.
+    refreshed_SHAP = refreshed.predict(Xy, pred_contribs=True)
+    assert not np.allclose(SHAP, refreshed_SHAP, rtol=1e-3)
+
+    # Prune the trees with a smaller max_depth.
+    X_update = X_train
+    y_update = y_train
+    Xy_update = xgb.DMatrix(X_update, y_update)
+
+    prune_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
+    pruned = xgb.train(
+        {"process_type": "update", "updater": "prune", "max_depth": 2},
+        Xy_update,
+        num_boost_round=n_rounds,
+        xgb_model=booster,
+        evals=[(Xy, "Original"), (Xy_update, "Train")],
+        evals_result=prune_result,
+    )
+    # The result is a smaller model, but with similar accuracy.
+    np.testing.assert_allclose(
+        np.array(prune_result["Original"]["rmse"]),
+        np.array(prune_result["Train"]["rmse"]),
+        atol=1e-5,
+    )
+
+
+if __name__ == "__main__":
+    main()
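A note on verifying the pruner, outside the patch itself: a quick sanity check is to measure tree depth directly. The sketch below is a minimal example assuming the ``pruned`` booster from the demo above is in scope; ``max_tree_depth`` is a hypothetical helper, not part of XGBoost. It relies on the text format produced by ``Booster.get_dump()``, where a node's depth equals the number of leading tab characters on its line.

import xgboost as xgb  # the demo above is assumed to have produced `pruned`

def max_tree_depth(booster: xgb.Booster) -> int:
    # Each line of a text dump looks like "\t\t3:leaf=0.42"; the number of
    # leading tabs is the depth of the node within its tree.
    depth = 0
    for tree in booster.get_dump():
        for line in tree.splitlines():
            depth = max(depth, len(line) - len(line.lstrip("\t")))
    return depth

# Every node should now sit at depth 2 or less, matching the `max_depth`
# passed to the prune updater.
assert max_tree_depth(pruned) <= 2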
diff --git a/doc/parameter.rst b/doc/parameter.rst
index 523023b51..29cc1692a 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -177,7 +177,7 @@ Parameters for Tree Booster
   - ``grow_gpu_hist``: Grow tree with GPU.
   - ``sync``: synchronizes trees in all distributed nodes.
   - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
-  - ``prune``: prunes the splits where loss < min_split_loss (or gamma).
+  - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``.
 
   - In a distributed setting, the implicit updater sequence value would be adjusted to ``grow_histmaker,prune`` by default, and you can set ``tree_method`` as ``hist`` to use ``grow_histmaker``.
diff --git a/doc/treemethod.rst b/doc/treemethod.rst
index beb703145..38d9822b6 100644
--- a/doc/treemethod.rst
+++ b/doc/treemethod.rst
@@ -78,9 +78,24 @@
 slight differences than expectation, which we are currently trying to overcome.
 
 Other Updaters
 **************
 
-1. ``Pruner``: It prunes the built tree by ``gamma`` parameter. ``pruner`` is usually
-   used as part of other tree methods.
-2. ``Refresh``: Refresh the statistic of built trees on a new training dataset.
+1. ``Prune``: It prunes the existing trees. ``prune`` is usually used as part of other
+   tree methods. To use the pruner independently, one needs to set the process type to
+   update: ``{"process_type": "update", "updater": "prune"}``. With this set of
+   parameters, XGBoost will prune the existing trees during training according to two
+   parameters, ``min_split_loss`` (``gamma``) and ``max_depth``.
+
+2. ``Refresh``: Refresh the statistics of built trees on a new training dataset. Like
+   the pruner, to use refresh independently, one needs to set the process type to
+   update: ``{"process_type": "update", "updater": "refresh"}``. During training, the
+   updater will change statistics like ``cover`` and ``weight`` according to the new
+   training dataset. When ``refresh_leaf`` is also set to true (the default), XGBoost
+   will update the leaf values according to the new leaf weights, but the tree
+   structure (split conditions) itself doesn't change.
+
+   There are examples of both training continuation (adding new trees) and the update
+   process in ``demo/guide-python``. Also check out the ``process_type`` parameter in
+   :doc:`parameter`.
+
 3. ``Sync``: Synchronize the tree among workers when running distributed training.
 
 ****************
diff --git a/tests/python-gpu/test_gpu_demos.py b/tests/python-gpu/test_gpu_demos.py
index 7e79378a3..df2021c71 100644
--- a/tests/python-gpu/test_gpu_demos.py
+++ b/tests/python-gpu/test_gpu_demos.py
@@ -14,6 +14,12 @@ def test_data_iterator():
     subprocess.check_call(cmd)
 
 
+def test_update_process_demo():
+    script = os.path.join(td.PYTHON_DEMO_DIR, 'update_process.py')
+    cmd = ['python', script]
+    subprocess.check_call(cmd)
+
+
 @pytest.mark.skipif(**tm.no_dask())
 @pytest.mark.skipif(**tm.no_dask_cuda())
 @pytest.mark.skipif(**tm.no_cupy())
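The new section in ``doc/treemethod.rst`` above mentions examples of both training continuation (adding new trees) and the update process. For comparison with the update demo, here is a minimal sketch of training continuation, assuming the same scikit-learn data as the demo; with the default ``process_type``, passing ``xgb_model`` appends new trees instead of modifying existing ones.

import xgboost as xgb
from sklearn.datasets import load_boston

X, y = load_boston(return_X_y=True)
Xy = xgb.DMatrix(X, y)

# Train an initial model with a handful of trees.
booster = xgb.train({"tree_method": "hist"}, Xy, num_boost_round=4)
assert booster.num_boosted_rounds() == 4

# Training continuation: the default process_type grows 4 new trees on top
# of the existing model rather than updating the existing trees in place.
continued = xgb.train(
    {"tree_method": "hist"}, Xy, num_boost_round=4, xgb_model=booster
)
assert continued.num_boosted_rounds() == 8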