Fix federated learning demos and tests (#9488)
parent b2e93d2742
commit 12fe2fc06c
demo/nvflare/.gitignore (new file)
@@ -0,0 +1 @@
+!config
demo/nvflare/config/config_fed_client.json (new file)
@@ -0,0 +1,23 @@
+{
+  "format_version": 2,
+  "executors": [
+    {
+      "tasks": [
+        "train"
+      ],
+      "executor": {
+        "path": "trainer.XGBoostTrainer",
+        "args": {
+          "server_address": "localhost:9091",
+          "world_size": 2,
+          "server_cert_path": "server-cert.pem",
+          "client_key_path": "client-key.pem",
+          "client_cert_path": "client-cert.pem",
+          "use_gpus": false
+        }
+      }
+    }
+  ],
+  "task_result_filters": [],
+  "task_data_filters": []
+}
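For reference, the `args` block above lines up one-to-one with the updated `XGBoostTrainer.__init__` further down in this diff. A minimal sketch of that mapping (the keyword values are copied from the JSON; the direct instantiation is illustrative only and is not how NVFlare actually loads the executor):

```python
# Illustrative only: NVFlare builds the executor from config_fed_client.json;
# this just shows that the "args" keys match the trainer's constructor.
from trainer import XGBoostTrainer  # the demo's custom trainer module

trainer = XGBoostTrainer(
    server_address="localhost:9091",   # federated server endpoint
    world_size=2,                      # number of participating sites
    server_cert_path="server-cert.pem",
    client_key_path="client-key.pem",
    client_cert_path="client-cert.pem",
    use_gpus=False,                    # flag added in this commit
)
```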
demo/nvflare/config/config_fed_server.json (new file)
@@ -0,0 +1,22 @@
+{
+  "format_version": 2,
+  "server": {
+    "heart_beat_timeout": 600
+  },
+  "task_data_filters": [],
+  "task_result_filters": [],
+  "workflows": [
+    {
+      "id": "server_workflow",
+      "path": "controller.XGBoostController",
+      "args": {
+        "port": 9091,
+        "world_size": 2,
+        "server_key_path": "server-key.pem",
+        "server_cert_path": "server-cert.pem",
+        "client_cert_path": "client-cert.pem"
+      }
+    }
+  ],
+  "components": []
+}
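The client and server configs have to agree on the port and the world size. A small hypothetical sanity check, assuming both files sit in the shared config directory added above:

```python
import json

# Hypothetical helper: verify the shared client/server configs are consistent.
with open("config/config_fed_client.json") as f:
    client = json.load(f)
with open("config/config_fed_server.json") as f:
    server = json.load(f)

client_args = client["executors"][0]["executor"]["args"]
server_args = server["workflows"][0]["args"]

# The client dials localhost:<port>, so the port must match the server workflow.
assert client_args["server_address"].endswith(str(server_args["port"]))
assert client_args["world_size"] == server_args["world_size"]
print("client/server federated configs are consistent")
```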
@@ -6,7 +6,7 @@ This directory contains a demo of Horizontal Federated Learning using
 ## Training with CPU only
 
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+[README](../../../plugin/federated/README.md)).
 
 Install NVFlare (note that currently NVFlare only supports Python 3.8):
 ```shell
@@ -16,7 +16,7 @@ split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.test
 
 nvflare poc -n 2 --prepare
 mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
+cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for (( site=1; site<=world_size; site++ )); do
 cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$site"/
@@ -6,7 +6,7 @@ This directory contains a demo of Vertical Federated Learning using
 ## Training with CPU only
 
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+[README](../../../plugin/federated/README.md)).
 
 Install NVFlare (note that currently NVFlare only supports Python 3.8):
 ```shell
@@ -16,7 +16,7 @@ class SupportedTasks(object):
 
 class XGBoostTrainer(Executor):
     def __init__(self, server_address: str, world_size: int, server_cert_path: str,
-                 client_key_path: str, client_cert_path: str):
+                 client_key_path: str, client_cert_path: str, use_gpus: bool):
         """Trainer for federated XGBoost.
 
         Args:
@@ -32,6 +32,7 @@ class XGBoostTrainer(Executor):
         self._server_cert_path = server_cert_path
         self._client_key_path = client_key_path
         self._client_cert_path = client_cert_path
+        self._use_gpus = use_gpus
 
     def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
                 abort_signal: Signal) -> Shareable:
@@ -81,6 +82,8 @@ class XGBoostTrainer(Executor):
             'objective': 'binary:logistic',
             'eval_metric': 'auc',
         }
+        if self._use_gpus:
+            self.log_info(fl_ctx, 'GPUs are not currently supported by vertical federated XGBoost')
 
         # specify validations set to watch performance
         watchlist = [(dtest, "eval"), (dtrain, "train")]
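In this vertical trainer the new flag only logs a warning, since GPU support is not available for vertical federated training. A hypothetical sketch (not part of this diff) of how a trainer that does support GPUs could honour `use_gpus`, using the same parameter style as the `test_federated.py` change further below:

```python
# Hypothetical sketch: build training parameters that honour a use_gpus flag,
# mirroring the 'tree_method'/'device' style used in test_federated.py below.
def make_params(use_gpus: bool, rank: int) -> dict:
    params = {
        'max_depth': 2,
        'eta': 1,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',
    }
    if use_gpus:
        # One GPU per federated participant, selected by rank.
        params['device'] = f'cuda:{rank}'
    return params
```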
@@ -56,7 +56,7 @@ fi
 
 nvflare poc -n 2 --prepare
 mkdir -p /tmp/nvflare/poc/admin/transfer/vertical-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
+cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for (( site=1; site<=world_size; site++ )); do
 cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"${site}"/
@@ -11,7 +11,7 @@ openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out se
 openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
 
 # Split train and test files manually to simulate a federated environment.
-split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.train agaricus.txt.train-
-split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.test agaricus.txt.test-
+split -n l/"${world_size}" -d ../../../demo/data/agaricus.txt.train agaricus.txt.train-
+split -n l/"${world_size}" -d ../../../demo/data/agaricus.txt.test agaricus.txt.test-
 
 python test_federated.py "${world_size}"
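A roughly equivalent Python version of the splitting step above, for readers without GNU `split` (it shards by line count rather than balanced byte size, and the shard names match the `%02d` suffixes that `test_federated.py` expects):

```python
# Roughly equivalent to: split -n l/"${world_size}" -d <input> <prefix>
# Divides a libsvm text file into world_size line-based shards with numeric suffixes.
def split_file(path: str, world_size: int, prefix: str) -> None:
    with open(path) as f:
        lines = f.readlines()
    shard_size = (len(lines) + world_size - 1) // world_size
    for rank in range(world_size):
        shard = lines[rank * shard_size:(rank + 1) * shard_size]
        with open(f'{prefix}{rank:02d}', 'w') as out:  # e.g. agaricus.txt.train-00
            out.writelines(shard)

split_file('../../../demo/data/agaricus.txt.train', 2, 'agaricus.txt.train-')
split_file('../../../demo/data/agaricus.txt.test', 2, 'agaricus.txt.test-')
```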
@@ -35,14 +35,14 @@ def run_worker(port: int, world_size: int, rank: int, with_ssl: bool, with_gpu:
     # Always call this before using distributed module
     with xgb.collective.CommunicatorContext(**communicator_env):
         # Load file, file will not be sharded in federated mode.
-        dtrain = xgb.DMatrix('agaricus.txt.train-%02d' % rank)
-        dtest = xgb.DMatrix('agaricus.txt.test-%02d' % rank)
+        dtrain = xgb.DMatrix('agaricus.txt.train-%02d?format=libsvm' % rank)
+        dtest = xgb.DMatrix('agaricus.txt.test-%02d?format=libsvm' % rank)
 
         # Specify parameters via map, definition are same as c++ version
         param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
         if with_gpu:
-            param['tree_method'] = 'gpu_hist'
-            param['gpu_id'] = rank
+            param['tree_method'] = 'hist'
+            param['device'] = f"cuda:{rank}"
 
         # Specify validations set to watch performance
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
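These two changes track newer XGBoost conventions: text files are loaded through a URI with an explicit `?format=libsvm` parameter, and GPU training is requested via `device` together with `tree_method='hist'` instead of the older `gpu_hist`/`gpu_id` pair. A small standalone sketch of the same idiom outside the federated setup (it assumes a recent XGBoost, roughly 2.0 or later, and that the demo data file exists at the path shown):

```python
# Standalone sketch (no federation): URI-style DMatrix loading plus the
# 'device' parameter used in the diff above. The data path is an assumption.
import xgboost as xgb

dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train?format=libsvm')

param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic',
         'tree_method': 'hist', 'device': 'cpu'}  # use 'cuda:0' for a single GPU

bst = xgb.train(param, dtrain, num_boost_round=2)
print(bst.eval(dtrain))
```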