From 7bdedacb54be482d96ada66dd576da57ab41bd9d Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 3 Aug 2021 13:11:52 +0800
Subject: [PATCH] Document for `process_type`. (#7135)

* Update document for prune and refresh.

* Add demo.
---
 demo/guide-python/update_process.py | 90 +++++++++++++++++++++++++++++
 doc/parameter.rst                   |  2 +-
 doc/treemethod.rst                  | 21 ++++++-
 tests/python-gpu/test_gpu_demos.py  |  6 ++
 4 files changed, 115 insertions(+), 4 deletions(-)
 create mode 100644 demo/guide-python/update_process.py

diff --git a/demo/guide-python/update_process.py b/demo/guide-python/update_process.py
new file mode 100644
index 000000000..53206f9c2
--- /dev/null
+++ b/demo/guide-python/update_process.py
@@ -0,0 +1,90 @@
+"""Demo for using `process_type` with `prune` and `refresh`. Modifying existing trees
+is not a well-established use of XGBoost, so feel free to experiment.
+
+"""
+
+import xgboost as xgb
+from sklearn.datasets import load_boston
+import numpy as np
+
+
+def main():
+    n_rounds = 32
+
+    X, y = load_boston(return_X_y=True)
+
+    # Train a model on the first half of the data.
+    X_train = X[: X.shape[0] // 2]
+    y_train = y[: y.shape[0] // 2]
+    Xy = xgb.DMatrix(X_train, y_train)
+    evals_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
+    booster = xgb.train(
+        {"tree_method": "gpu_hist", "max_depth": 6},
+        Xy,
+        num_boost_round=n_rounds,
+        evals=[(Xy, "Train")],
+        evals_result=evals_result,
+    )
+    SHAP = booster.predict(Xy, pred_contribs=True)
+
+    # Refresh the leaf values and tree statistics on the second half of the data.
+    X_refresh = X[X.shape[0] // 2:]
+    y_refresh = y[y.shape[0] // 2:]
+    Xy_refresh = xgb.DMatrix(X_refresh, y_refresh)
+    # With refresh_leaf set to True, the model adapts to the other half of the
+    # data by changing leaf values (no change in split conditions).
+    refresh_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
+    refreshed = xgb.train(
+        {"process_type": "update", "updater": "refresh", "refresh_leaf": True},
+        Xy_refresh,
+        num_boost_round=n_rounds,
+        xgb_model=booster,
+        evals=[(Xy, "Original"), (Xy_refresh, "Train")],
+        evals_result=refresh_result,
+    )
+
+    # Refresh the model without changing the leaf values; tree statistics
+    # including cover and weight are still refreshed.
+    refresh_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
+    refreshed = xgb.train(
+        {"process_type": "update", "updater": "refresh", "refresh_leaf": False},
+        Xy_refresh,
+        num_boost_round=n_rounds,
+        xgb_model=booster,
+        evals=[(Xy, "Original"), (Xy_refresh, "Train")],
+        evals_result=refresh_result,
+    )
+    # Without refreshing the leaf values, the resulting trees should be the same
+    # as in the original model except for the accumulated statistics. The rtol
+    # accounts for floating point error in prediction.
+    np.testing.assert_allclose(
+        refresh_result["Original"]["rmse"], evals_result["Train"]["rmse"], rtol=1e-5
+    )
+    # But the SHAP values change, since the cover in the tree nodes has changed.
+    refreshed_SHAP = refreshed.predict(Xy, pred_contribs=True)
+    assert not np.allclose(SHAP, refreshed_SHAP, rtol=1e-3)
+
+    # Prune the trees with a smaller max_depth.
+    X_update = X_train
+    y_update = y_train
+    Xy_update = xgb.DMatrix(X_update, y_update)
+
+    prune_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
+    pruned = xgb.train(
+        {"process_type": "update", "updater": "prune", "max_depth": 2},
+        Xy_update,
+        num_boost_round=n_rounds,
+        xgb_model=booster,
+        evals=[(Xy, "Original"), (Xy_update, "Train")],
+        evals_result=prune_result,
+    )
+    # The result is a smaller model, but with similar accuracy.
+    np.testing.assert_allclose(
+        np.array(prune_result["Original"]["rmse"]),
+        np.array(prune_result["Train"]["rmse"]),
+        atol=1e-5,
+    )
+
+
+if __name__ == "__main__":
+    main()
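A note on verifying the pruner, outside the patch itself: a quick sanity check is to measure tree depth directly. The sketch below is a minimal example assuming the ``pruned`` booster from the demo above is in scope; ``max_tree_depth`` is a hypothetical helper, not part of XGBoost. It relies on the text format produced by ``Booster.get_dump()``, where a node's depth equals the number of leading tab characters on its line.

import xgboost as xgb  # the demo above is assumed to have produced `pruned`

def max_tree_depth(booster: xgb.Booster) -> int:
    # Each line of a text dump looks like "\t\t3:leaf=0.42"; the number of
    # leading tabs is the depth of the node within its tree.
    depth = 0
    for tree in booster.get_dump():
        for line in tree.splitlines():
            depth = max(depth, len(line) - len(line.lstrip("\t")))
    return depth

# Every node should now sit at depth 2 or less, matching the `max_depth`
# passed to the prune updater.
assert max_tree_depth(pruned) <= 2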
diff --git a/doc/parameter.rst b/doc/parameter.rst
index 523023b51..29cc1692a 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -177,7 +177,7 @@ Parameters for Tree Booster
   - ``grow_gpu_hist``: Grow tree with GPU.
   - ``sync``: synchronizes trees in all distributed nodes.
   - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
-  - ``prune``: prunes the splits where loss < min_split_loss (or gamma).
+  - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``.
 
   - In a distributed setting, the implicit updater sequence value would be adjusted to ``grow_histmaker,prune`` by default, and you can set ``tree_method`` as ``hist`` to use ``grow_histmaker``.
diff --git a/doc/treemethod.rst b/doc/treemethod.rst
index beb703145..38d9822b6 100644
--- a/doc/treemethod.rst
+++ b/doc/treemethod.rst
@@ -78,9 +78,24 @@
 slight differences than expectation, which we are currently trying to overcome.
 
 Other Updaters
 **************
 
-1. ``Pruner``: It prunes the built tree by ``gamma`` parameter. ``pruner`` is usually
-   used as part of other tree methods.
-2. ``Refresh``: Refresh the statistic of built trees on a new training dataset.
+1. ``Prune``: It prunes the existing trees. ``prune`` is usually used as part of other
+   tree methods. To use the pruner independently, one needs to set the process type to
+   update: ``{"process_type": "update", "updater": "prune"}``. With this set of
+   parameters, XGBoost will prune the existing trees during training according to two
+   parameters, ``min_split_loss`` (``gamma``) and ``max_depth``.
+
+2. ``Refresh``: Refresh the statistics of built trees on a new training dataset. Like
+   the pruner, to use refresh independently, one needs to set the process type to
+   update: ``{"process_type": "update", "updater": "refresh"}``. During training, the
+   updater will change statistics like ``cover`` and ``weight`` according to the new
+   training dataset. When ``refresh_leaf`` is also set to true (the default), XGBoost
+   will update the leaf values according to the new leaf weights, but the tree
+   structure (split conditions) itself doesn't change.
+
+   There are examples of both training continuation (adding new trees) and the update
+   process in ``demo/guide-python``. Also check out the ``process_type`` parameter in
+   :doc:`parameter`.
+
 3. ``Sync``: Synchronize the tree among workers when running distributed training.
 
 ****************
diff --git a/tests/python-gpu/test_gpu_demos.py b/tests/python-gpu/test_gpu_demos.py
index 7e79378a3..df2021c71 100644
--- a/tests/python-gpu/test_gpu_demos.py
+++ b/tests/python-gpu/test_gpu_demos.py
@@ -14,6 +14,12 @@ def test_data_iterator():
     subprocess.check_call(cmd)
 
 
+def test_update_process_demo():
+    script = os.path.join(td.PYTHON_DEMO_DIR, 'update_process.py')
+    cmd = ['python', script]
+    subprocess.check_call(cmd)
+
+
 @pytest.mark.skipif(**tm.no_dask())
 @pytest.mark.skipif(**tm.no_dask_cuda())
 @pytest.mark.skipif(**tm.no_cupy())
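The new section in ``doc/treemethod.rst`` above mentions examples of both training continuation (adding new trees) and the update process. For comparison with the update demo, here is a minimal sketch of training continuation, assuming the same scikit-learn data as the demo; with the default ``process_type``, passing ``xgb_model`` appends new trees instead of modifying existing ones.

import xgboost as xgb
from sklearn.datasets import load_boston

X, y = load_boston(return_X_y=True)
Xy = xgb.DMatrix(X, y)

# Train an initial model with a handful of trees.
booster = xgb.train({"tree_method": "hist"}, Xy, num_boost_round=4)
assert booster.num_boosted_rounds() == 4

# Training continuation: the default process_type grows 4 new trees on top
# of the existing model rather than updating the existing trees in place.
continued = xgb.train(
    {"tree_method": "hist"}, Xy, num_boost_round=4, xgb_model=booster
)
assert continued.num_boosted_rounds() == 8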