Fix federated learning demos and tests (#9488)

Sean Yang 2023-08-16 00:25:05 -07:00 committed by GitHub
parent b2e93d2742
commit 12fe2fc06c
10 changed files with 60 additions and 11 deletions

demo/nvflare/.gitignore (new file)

@@ -0,0 +1 @@
!config


@@ -0,0 +1,23 @@
{
"format_version": 2,
"executors": [
{
"tasks": [
"train"
],
"executor": {
"path": "trainer.XGBoostTrainer",
"args": {
"server_address": "localhost:9091",
"world_size": 2,
"server_cert_path": "server-cert.pem",
"client_key_path": "client-key.pem",
"client_cert_path": "client-cert.pem",
"use_gpus": false
}
}
}
],
"task_result_filters": [],
"task_data_filters": []
}
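For reference, the `args` map above is handed to the trainer's constructor as keyword arguments, and its keys line up with the updated `XGBoostTrainer.__init__` signature shown in the trainer hunk further down. A minimal sketch, assuming NVFlare is installed and the demo's `trainer.py` is importable from the job's custom code directory:

```python
# Sketch only: NVFlare instantiates the class named in "path" with the "args" above.
from trainer import XGBoostTrainer  # import path assumed from this demo's layout

executor = XGBoostTrainer(
    server_address="localhost:9091",
    world_size=2,
    server_cert_path="server-cert.pem",
    client_key_path="client-key.pem",
    client_cert_path="client-cert.pem",
    use_gpus=False,  # new argument added by this commit
)
```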


@@ -0,0 +1,22 @@
{
"format_version": 2,
"server": {
"heart_beat_timeout": 600
},
"task_data_filters": [],
"task_result_filters": [],
"workflows": [
{
"id": "server_workflow",
"path": "controller.XGBoostController",
"args": {
"port": 9091,
"world_size": 2,
"server_key_path": "server-key.pem",
"server_cert_path": "server-cert.pem",
"client_cert_path": "client-cert.pem"
}
}
],
"components": []
}
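The client and server configs have to agree with each other: the client's `server_address` should point at the port the `XGBoostController` workflow opens, and both sides should use the same `world_size`. A small, hypothetical sanity check (the JSON file names follow the usual NVFlare naming and are an assumption here, since the diff view above omits them):

```python
import json

# Assumed file names under the job's config/ directory.
with open("config/config_fed_client.json") as f:
    client_cfg = json.load(f)
with open("config/config_fed_server.json") as f:
    server_cfg = json.load(f)

client_args = client_cfg["executors"][0]["executor"]["args"]
server_args = server_cfg["workflows"][0]["args"]

# localhost:9091 on the client side must match port 9091 on the server side.
assert client_args["server_address"].endswith(f":{server_args['port']}")
assert client_args["world_size"] == server_args["world_size"]
```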


@@ -6,7 +6,7 @@ This directory contains a demo of Horizontal Federated Learning using
 ## Training with CPU only
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+[README](../../../plugin/federated/README.md)).
 Install NVFlare (note that currently NVFlare only supports Python 3.8):
 ```shell
```shell ```shell


@@ -16,7 +16,7 @@ split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.test
 nvflare poc -n 2 --prepare
 mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
+cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for (( site=1; site<=world_size; site++ )); do
   cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$site"/


@@ -6,7 +6,7 @@ This directory contains a demo of Vertical Federated Learning using
 ## Training with CPU only
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+[README](../../../plugin/federated/README.md)).
 Install NVFlare (note that currently NVFlare only supports Python 3.8):
 ```shell


@@ -16,7 +16,7 @@ class SupportedTasks(object):
 
 class XGBoostTrainer(Executor):
     def __init__(self, server_address: str, world_size: int, server_cert_path: str,
-                 client_key_path: str, client_cert_path: str):
+                 client_key_path: str, client_cert_path: str, use_gpus: bool):
         """Trainer for federated XGBoost.
 
         Args:
@@ -32,6 +32,7 @@ class XGBoostTrainer(Executor):
         self._server_cert_path = server_cert_path
         self._client_key_path = client_key_path
         self._client_cert_path = client_cert_path
+        self._use_gpus = use_gpus
 
     def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
                 abort_signal: Signal) -> Shareable:
@@ -81,6 +82,8 @@ class XGBoostTrainer(Executor):
             'objective': 'binary:logistic',
             'eval_metric': 'auc',
         }
+        if self._use_gpus:
+            self.log_info(fl_ctx, 'GPUs are not currently supported by vertical federated XGBoost')
 
         # specify validations set to watch performance
         watchlist = [(dtest, "eval"), (dtrain, "train")]


@@ -56,7 +56,7 @@ fi
 nvflare poc -n 2 --prepare
 mkdir -p /tmp/nvflare/poc/admin/transfer/vertical-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
+cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for (( site=1; site<=world_size; site++ )); do
   cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"${site}"/


@@ -11,7 +11,7 @@ openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out se
 openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
 # Split train and test files manually to simulate a federated environment.
-split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.train agaricus.txt.train-
-split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.test agaricus.txt.test-
+split -n l/"${world_size}" -d ../../../demo/data/agaricus.txt.train agaricus.txt.train-
+split -n l/"${world_size}" -d ../../../demo/data/agaricus.txt.test agaricus.txt.test-
 python test_federated.py "${world_size}"


@@ -35,14 +35,14 @@ def run_worker(port: int, world_size: int, rank: int, with_ssl: bool, with_gpu:
     # Always call this before using distributed module
     with xgb.collective.CommunicatorContext(**communicator_env):
         # Load file, file will not be sharded in federated mode.
-        dtrain = xgb.DMatrix('agaricus.txt.train-%02d' % rank)
-        dtest = xgb.DMatrix('agaricus.txt.test-%02d' % rank)
+        dtrain = xgb.DMatrix('agaricus.txt.train-%02d?format=libsvm' % rank)
+        dtest = xgb.DMatrix('agaricus.txt.test-%02d?format=libsvm' % rank)
 
         # Specify parameters via map, definition are same as c++ version
         param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
         if with_gpu:
-            param['tree_method'] = 'gpu_hist'
-            param['gpu_id'] = rank
+            param['tree_method'] = 'hist'
+            param['device'] = f"cuda:{rank}"
 
         # Specify validations set to watch performance
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
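
The last two replacements move the test to the parameter style used by recent XGBoost releases, where the dedicated `gpu_hist` tree method and `gpu_id` are superseded by `tree_method='hist'` combined with a `device` string, and text files loaded by URI carry an explicit `?format=` suffix. A minimal non-federated sketch of the same style, assuming one of the split files produced above is present and a GPU is available:

```python
import xgboost as xgb

# Explicit '?format=libsvm' mirrors the DMatrix change in the hunk above.
dtrain = xgb.DMatrix("agaricus.txt.train-00?format=libsvm")

param = {
    "max_depth": 2,
    "eta": 1,
    "objective": "binary:logistic",
    "tree_method": "hist",
    "device": "cuda:0",  # replaces tree_method='gpu_hist' + gpu_id=0
}
bst = xgb.train(param, dtrain, num_boost_round=2)
```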