Support GPU training in the NVFlare demo (#7965)

parent 6b55150e80
commit 31e6902e43
@@ -3,6 +3,8 @@
 This directory contains a demo of Federated Learning using
 [NVFlare](https://nvidia.github.io/NVFlare/).
 
+## Training with CPU only
+
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
 [README](../../plugin/federated/README.md)).
 
@@ -53,3 +55,12 @@ Finally, shutdown everything from the admin CLI:
 shutdown client
 shutdown server
 ```
+
+## Training with GPUs
+
+To run the demo with Federated Learning using GPUs, make sure your machine has at least
+2 GPUs. Build XGBoost with the federated learning plugin enabled along with CUDA, but
+with NCCL turned off (see the [README](../../plugin/federated/README.md)).
+
+Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the
+steps above.
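A minimal sketch of flipping that flag without hand-editing the file, assuming the demo's layout: the path and the `use_gpus` key come from this commit, while the surrounding JSON structure follows NVFlare's usual `executors` → `executor` → `args` nesting and may differ in your checkout.

```python
import json

CONFIG = "config/config_fed_client.json"  # path relative to the demo directory

with open(CONFIG) as f:
    config = json.load(f)

# Enable GPU training for every executor that takes a `use_gpus` argument.
for entry in config.get("executors", []):
    args = entry["executor"]["args"]
    if "use_gpus" in args:
        args["use_gpus"] = True

with open(CONFIG, "w") as f:
    json.dump(config, f, indent=2)
```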
@@ -12,7 +12,8 @@
       "world_size": 2,
       "server_cert_path": "server-cert.pem",
       "client_key_path": "client-key.pem",
-      "client_cert_path": "client-cert.pem"
+      "client_cert_path": "client-cert.pem",
+      "use_gpus": false
     }
   }
 }
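NVFlare builds the executor from this file and passes the `args` mapping to the trainer's constructor as keyword arguments. A hedged sketch of that wiring (the `server_address` value is assumed; it is not shown in this hunk):

```python
from dataclasses import dataclass

# Stand-in mirroring XGBoostTrainer.__init__'s parameters (see the trainer
# diff below); it only demonstrates how the JSON "args" block becomes kwargs.
@dataclass
class TrainerArgs:
    server_address: str
    world_size: int
    server_cert_path: str
    client_key_path: str
    client_cert_path: str
    use_gpus: bool

args_from_json = {
    "server_address": "localhost:9091",  # assumed value, not part of this hunk
    "world_size": 2,
    "server_cert_path": "server-cert.pem",
    "client_key_path": "client-key.pem",
    "client_cert_path": "client-cert.pem",
    "use_gpus": False,
}
trainer_args = TrainerArgs(**args_from_json)  # NVFlare does XGBoostTrainer(**args)
```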
@@ -16,7 +16,7 @@ class SupportedTasks(object):
 
 class XGBoostTrainer(Executor):
     def __init__(self, server_address: str, world_size: int, server_cert_path: str,
-                 client_key_path: str, client_cert_path: str):
+                 client_key_path: str, client_cert_path: str, use_gpus: bool):
         """Trainer for federated XGBoost.
 
         Args:
@@ -32,6 +32,7 @@ class XGBoostTrainer(Executor):
         self._server_cert_path = server_cert_path
         self._client_key_path = client_key_path
         self._client_cert_path = client_cert_path
+        self._use_gpus = use_gpus
 
     def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
                 abort_signal: Signal) -> Shareable:
@@ -66,6 +67,10 @@
 
         # Specify parameters via map; definitions are the same as the C++ version.
         param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
+        if self._use_gpus:
+            self.log_info(fl_ctx, f'Training with GPU {rank}')
+            param['tree_method'] = 'gpu_hist'
+            param['gpu_id'] = rank
 
         # Specify validation sets to watch performance.
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
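Pulled out as a self-contained sketch: when GPUs are enabled, each federated client switches to the GPU histogram tree method and binds to the device matching its rank, so two clients on one machine train on different GPUs. The helper name here is illustrative, not part of the demo:

```python
def make_training_params(rank: int, use_gpus: bool) -> dict:
    """Build the XGBoost params used by one federated client."""
    param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
    if use_gpus:
        param['tree_method'] = 'gpu_hist'  # GPU histogram algorithm
        param['gpu_id'] = rank             # client rank selects the device
    return param

# The rank-1 client on a 2-GPU machine lands on GPU 1.
assert make_training_params(1, use_gpus=True)['gpu_id'] == 1
```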
@@ -20,7 +20,12 @@ Build the Plugin
 # Under xgboost source tree.
 mkdir build
 cd build
-cmake .. -GNinja -DPLUGIN_FEDERATED=ON
+# For now NCCL needs to be turned off.
+cmake .. -GNinja \
+  -DPLUGIN_FEDERATED=ON \
+  -DUSE_CUDA=ON \
+  -DBUILD_WITH_CUDA_CUB=ON \
+  -DUSE_NCCL=OFF
 ninja
 cd ../python-package
 pip install -e .  # or equivalently python setup.py develop
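After `pip install -e .`, one quick way to confirm the freshly built package was compiled as intended (a sketch; `build_info()` is available in recent XGBoost releases):

```python
import xgboost as xgb

# Prints the compile-time configuration of the installed build; for this
# setup expect USE_CUDA to be true and USE_NCCL to be false.
print(xgb.build_info())
```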
@@ -31,5 +36,6 @@ Test Federated XGBoost
 ```shell
 # Under xgboost source tree.
 cd tests/distributed
+# This tests both CPU training (`hist`) and GPU training (`gpu_hist`).
 ./runtests-federated.sh
 ```