diff --git a/demo/nvflare/.gitignore b/demo/nvflare/.gitignore
new file mode 100644
index 000000000..d5702b886
--- /dev/null
+++ b/demo/nvflare/.gitignore
@@ -0,0 +1 @@
+!config
diff --git a/demo/nvflare/config/config_fed_client.json b/demo/nvflare/config/config_fed_client.json
new file mode 100644
index 000000000..cfe294172
--- /dev/null
+++ b/demo/nvflare/config/config_fed_client.json
@@ -0,0 +1,23 @@
+{
+  "format_version": 2,
+  "executors": [
+    {
+      "tasks": [
+        "train"
+      ],
+      "executor": {
+        "path": "trainer.XGBoostTrainer",
+        "args": {
+          "server_address": "localhost:9091",
+          "world_size": 2,
+          "server_cert_path": "server-cert.pem",
+          "client_key_path": "client-key.pem",
+          "client_cert_path": "client-cert.pem",
+          "use_gpus": false
+        }
+      }
+    }
+  ],
+  "task_result_filters": [],
+  "task_data_filters": []
+}
diff --git a/demo/nvflare/config/config_fed_server.json b/demo/nvflare/config/config_fed_server.json
new file mode 100644
index 000000000..32993b652
--- /dev/null
+++ b/demo/nvflare/config/config_fed_server.json
@@ -0,0 +1,22 @@
+{
+  "format_version": 2,
+  "server": {
+    "heart_beat_timeout": 600
+  },
+  "task_data_filters": [],
+  "task_result_filters": [],
+  "workflows": [
+    {
+      "id": "server_workflow",
+      "path": "controller.XGBoostController",
+      "args": {
+        "port": 9091,
+        "world_size": 2,
+        "server_key_path": "server-key.pem",
+        "server_cert_path": "server-cert.pem",
+        "client_cert_path": "client-cert.pem"
+      }
+    }
+  ],
+  "components": []
+}
diff --git a/demo/nvflare/horizontal/README.md b/demo/nvflare/horizontal/README.md
index 744e90915..19ac4cf4e 100644
--- a/demo/nvflare/horizontal/README.md
+++ b/demo/nvflare/horizontal/README.md
@@ -6,7 +6,7 @@ This directory contains a demo of Horizontal Federated Learning using
 ## Training with CPU only
 
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+[README](../../../plugin/federated/README.md)).
 
 Install NVFlare (note that currently NVFlare only supports Python 3.8):
 ```shell
diff --git a/demo/nvflare/horizontal/prepare_data.sh b/demo/nvflare/horizontal/prepare_data.sh
index eed1390b5..eb3a19d50 100755
--- a/demo/nvflare/horizontal/prepare_data.sh
+++ b/demo/nvflare/horizontal/prepare_data.sh
@@ -16,7 +16,7 @@ split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.test
 
 nvflare poc -n 2 --prepare
 mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
+cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for (( site=1; site<=world_size; site++ )); do
   cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$site"/
diff --git a/demo/nvflare/vertical/README.md b/demo/nvflare/vertical/README.md
index 83c3111b6..f9cca57d9 100644
--- a/demo/nvflare/vertical/README.md
+++ b/demo/nvflare/vertical/README.md
@@ -6,7 +6,7 @@ This directory contains a demo of Vertical Federated Learning using
 ## Training with CPU only
 
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+[README](../../../plugin/federated/README.md)).
 
 Install NVFlare (note that currently NVFlare only supports Python 3.8):
 ```shell
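The shared client config above is what ties each demo together: NVFlare's component loader instantiates the class named by `path` and passes the `args` object as keyword arguments, so the JSON must stay in sync with the `XGBoostTrainer` constructor, which is why the trainer diff below grows a matching `use_gpus` parameter. A rough sketch of the wiring (not NVFlare's actual loader code):

```python
# Sketch only: NVFlare builds the executor roughly like this, turning the
# "args" object from config_fed_client.json into constructor keyword arguments.
from trainer import XGBoostTrainer  # the demo's custom/trainer.py

executor = XGBoostTrainer(
    server_address="localhost:9091",
    world_size=2,
    server_cert_path="server-cert.pem",
    client_key_path="client-key.pem",
    client_cert_path="client-cert.pem",
    use_gpus=False,  # new argument introduced by this change
)
```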
diff --git a/demo/nvflare/vertical/custom/trainer.py b/demo/nvflare/vertical/custom/trainer.py
index cd420129c..1c235a439 100644
--- a/demo/nvflare/vertical/custom/trainer.py
+++ b/demo/nvflare/vertical/custom/trainer.py
@@ -16,7 +16,7 @@ class SupportedTasks(object):
 
 class XGBoostTrainer(Executor):
     def __init__(self, server_address: str, world_size: int, server_cert_path: str,
-                 client_key_path: str, client_cert_path: str):
+                 client_key_path: str, client_cert_path: str, use_gpus: bool):
         """Trainer for federated XGBoost.
 
         Args:
@@ -32,6 +32,7 @@ class XGBoostTrainer(Executor):
         self._server_cert_path = server_cert_path
         self._client_key_path = client_key_path
         self._client_cert_path = client_cert_path
+        self._use_gpus = use_gpus
 
     def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
                 abort_signal: Signal) -> Shareable:
@@ -81,6 +82,8 @@ class XGBoostTrainer(Executor):
             'objective': 'binary:logistic',
             'eval_metric': 'auc',
         }
+        if self._use_gpus:
+            self.log_info(fl_ctx, 'GPUs are not currently supported by vertical federated XGBoost')
 
         # specify validations set to watch performance
         watchlist = [(dtest, "eval"), (dtrain, "train")]
diff --git a/demo/nvflare/vertical/prepare_data.sh b/demo/nvflare/vertical/prepare_data.sh
index 86ec3dfa2..398ba2a10 100755
--- a/demo/nvflare/vertical/prepare_data.sh
+++ b/demo/nvflare/vertical/prepare_data.sh
@@ -56,7 +56,7 @@ fi
 
 nvflare poc -n 2 --prepare
 mkdir -p /tmp/nvflare/poc/admin/transfer/vertical-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
+cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for (( site=1; site<=world_size; site++ )); do
   cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"${site}"/
diff --git a/tests/test_distributed/test_federated/runtests-federated.sh b/tests/test_distributed/test_federated/runtests-federated.sh
index 81a40c350..8bdb2bc5b 100755
--- a/tests/test_distributed/test_federated/runtests-federated.sh
+++ b/tests/test_distributed/test_federated/runtests-federated.sh
@@ -11,7 +11,7 @@ openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out se
 openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
 
 # Split train and test files manually to simulate a federated environment.
-split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.train agaricus.txt.train-
-split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.test agaricus.txt.test-
+split -n l/"${world_size}" -d ../../../demo/data/agaricus.txt.train agaricus.txt.train-
+split -n l/"${world_size}" -d ../../../demo/data/agaricus.txt.test agaricus.txt.test-
 
 python test_federated.py "${world_size}"
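The `use_gpus` flag threaded through the trainer above is accepted but not acted on in the vertical demo, which only logs that GPUs are unsupported. For reference, a minimal sketch of how an executor could honor the flag with current XGBoost parameters; the `gpu_hist`/`gpu_id` pair is deprecated in favor of `tree_method='hist'` plus a `device` string, as the test diff below shows:

```python
# Hypothetical sketch, not the demo's actual code: honoring a use_gpus flag
# with the modern XGBoost GPU parameters.
param = {
    "max_depth": 2,
    "eta": 1,
    "objective": "binary:logistic",
    "eval_metric": "auc",
}
if use_gpus:  # assumed flag, mirroring XGBoostTrainer._use_gpus
    param["tree_method"] = "hist"
    param["device"] = "cuda"  # select the default GPU on this worker
```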
diff --git a/tests/test_distributed/test_federated/test_federated.py b/tests/test_distributed/test_federated/test_federated.py
index 9b8e55915..dba797078 100644
--- a/tests/test_distributed/test_federated/test_federated.py
+++ b/tests/test_distributed/test_federated/test_federated.py
@@ -35,14 +35,14 @@ def run_worker(port: int, world_size: int, rank: int, with_ssl: bool, with_gpu:
     # Always call this before using distributed module
     with xgb.collective.CommunicatorContext(**communicator_env):
         # Load file, file will not be sharded in federated mode.
-        dtrain = xgb.DMatrix('agaricus.txt.train-%02d' % rank)
-        dtest = xgb.DMatrix('agaricus.txt.test-%02d' % rank)
+        dtrain = xgb.DMatrix('agaricus.txt.train-%02d?format=libsvm' % rank)
+        dtest = xgb.DMatrix('agaricus.txt.test-%02d?format=libsvm' % rank)
 
         # Specify parameters via map, definition are same as c++ version
         param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
         if with_gpu:
-            param['tree_method'] = 'gpu_hist'
-            param['gpu_id'] = rank
+            param['tree_method'] = 'hist'
+            param['device'] = f"cuda:{rank}"
 
         # Specify validations set to watch performance
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
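Both replacements in this last hunk track upstream XGBoost API changes: text inputs now need an explicit format hint in the URI, and GPU selection moved from `gpu_id` to the `device` parameter. A minimal standalone sketch of the new loading convention, using one shard produced by `runtests-federated.sh`:

```python
# Minimal sketch: newer XGBoost releases no longer guess the format of text
# files, so the LIBSVM shard is loaded with an explicit ?format=libsvm hint.
import xgboost as xgb

dtrain = xgb.DMatrix("agaricus.txt.train-00?format=libsvm")
print(dtrain.num_row(), dtrain.num_col())
```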