Support GPU training in the NVFlare demo (#7965)

This commit is contained in:
Rong Ou 2022-06-02 06:52:36 -07:00 committed by GitHub
parent 6b55150e80
commit 31e6902e43
4 changed files with 26 additions and 3 deletions

View File

@@ -3,6 +3,8 @@
 This directory contains a demo of Federated Learning using
 [NVFlare](https://nvidia.github.io/NVFlare/).
 
+## Training with CPU only
+
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
 [README](../../plugin/federated/README.md)).
@@ -53,3 +55,12 @@ Finally, shutdown everything from the admin CLI:
 shutdown client
 shutdown server
 ```
+
+## Training with GPUs
+
+To run the demo of Federated Learning with GPUs, make sure your machine has at least 2 GPUs.
+Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
+turned off (see the [README](../../plugin/federated/README.md)).
+
+Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
+above.
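
For reference, after that edit the `args` section of `config/config_fed_client.json` (diffed in the next file) would read roughly as below; the surrounding nesting is abbreviated here, and every value other than `use_gpus` is the one shipped with the demo:

```json
"args": {
  "world_size": 2,
  "server_cert_path": "server-cert.pem",
  "client_key_path": "client-key.pem",
  "client_cert_path": "client-cert.pem",
  "use_gpus": "true"
}
```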

View File

@@ -12,7 +12,8 @@
         "world_size": 2,
         "server_cert_path": "server-cert.pem",
         "client_key_path": "client-key.pem",
-        "client_cert_path": "client-cert.pem"
+        "client_cert_path": "client-cert.pem",
+        "use_gpus": "false"
       }
     }
   }
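
One caveat worth flagging: `use_gpus` is written here as the JSON string `"false"`, not a boolean. If the framework hands that string to Python unconverted, it is truthy, so a consumer must parse the string explicitly rather than rely on `bool()`. A two-line illustration:

```python
# Any non-empty string is truthy in Python, including "false".
print(bool("false"))               # True
print("false".lower() == "true")   # False -- explicit parsing distinguishes the two
```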

View File

@@ -16,7 +16,7 @@ class SupportedTasks(object):
 
 class XGBoostTrainer(Executor):
     def __init__(self, server_address: str, world_size: int, server_cert_path: str,
-                 client_key_path: str, client_cert_path: str):
+                 client_key_path: str, client_cert_path: str, use_gpus: bool):
         """Trainer for federated XGBoost.
 
         Args:
@@ -32,6 +32,7 @@ class XGBoostTrainer(Executor):
         self._server_cert_path = server_cert_path
         self._client_key_path = client_key_path
         self._client_cert_path = client_cert_path
+        self._use_gpus = use_gpus
 
     def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
                 abort_signal: Signal) -> Shareable:
@@ -66,6 +67,10 @@ class XGBoostTrainer(Executor):
         # Specify parameters via map, definition are same as c++ version
         param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
+        if self._use_gpus:
+            self.log_info(fl_ctx, f'Training with GPU {rank}')
+            param['tree_method'] = 'gpu_hist'
+            param['gpu_id'] = rank
 
         # Specify validations set to watch performance
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
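
To try the GPU parameter selection in isolation, here is a minimal, self-contained sketch. It assumes a CUDA-enabled XGBoost build with the 1.x parameter names used above (`gpu_hist`, `gpu_id`), and substitutes synthetic data and a hard-coded rank for the federated setup:

```python
import numpy as np
import xgboost as xgb

rank = 0          # stands in for the federated rank; doubles as the GPU ordinal
use_gpus = True   # mirrors the `use_gpus` flag added in this commit

# Tiny synthetic binary-classification data in place of the demo's dataset.
X = np.random.rand(256, 8)
y = (X[:, 0] > 0.5).astype(int)
dtrain = xgb.DMatrix(X, label=y)

param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
if use_gpus:
    param['tree_method'] = 'gpu_hist'  # GPU histogram algorithm
    param['gpu_id'] = rank             # pin this worker to its own GPU

bst = xgb.train(param, dtrain, num_boost_round=10, evals=[(dtrain, 'train')])
```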

View File

@@ -20,7 +20,12 @@ Build the Plugin
 # Under xgboost source tree.
 mkdir build
 cd build
-cmake .. -GNinja -DPLUGIN_FEDERATED=ON
+# For now NCCL needs to be turned off.
+cmake .. -GNinja \
+  -DPLUGIN_FEDERATED=ON \
+  -DUSE_CUDA=ON \
+  -DBUILD_WITH_CUDA_CUB=ON \
+  -DUSE_NCCL=OFF
 ninja
 cd ../python-package
 pip install -e .  # or equivalently python setup.py develop
@@ -31,5 +36,6 @@ Test Federated XGBoost
 ```shell
 # Under xgboost source tree.
 cd tests/distributed
+# This tests both CPU training (`hist`) and GPU training (`gpu_hist`).
 ./runtests-federated.sh
 ```
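
As a quick sanity check that the installed package matches the cmake flags above, recent XGBoost versions expose `xgboost.build_info()`; a small sketch, assuming that API is available in your build:

```python
# Print the compile-time flags of the installed XGBoost; for the federated GPU
# demo we expect CUDA enabled and NCCL disabled.
import xgboost as xgb

info = xgb.build_info()
print('USE_CUDA:', info.get('USE_CUDA'))
print('USE_NCCL:', info.get('USE_NCCL'))
```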