merge 23Mar01

2023-05-02 00:05:58 +02:00
parent 313a74b582 08ce495b5d
commit 5446c501af
258 changed files with 7471 additions and 5379 deletions
--- a/demo/CLI/binary_classification/mushroom.conf
+++ b/demo/CLI/binary_classification/mushroom.conf
@@ -20,10 +20,10 @@ num_round = 2
 # 0 means do not save any model except the final round model
 save_period = 2
 # The path of training data
-data = "agaricus.txt.train"
+data = "agaricus.txt.train?format=libsvm"
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "agaricus.txt.test"
+eval[test] = "agaricus.txt.test?format=libsvm"
 # evaluate on training data as well each round
 eval_train = 1
 # The path of test data
-test:data = "agaricus.txt.test"
+test:data = "agaricus.txt.test?format=libsvm"
--- a/demo/CLI/regression/machine.conf
+++ b/demo/CLI/regression/machine.conf
@@ -21,8 +21,8 @@ num_round = 2
 # 0 means do not save any model except the final round model
 save_period = 0
 # The path of training data
-data = "machine.txt.train"
+data = "machine.txt.train?format=libsvm"
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "machine.txt.test"
+eval[test] = "machine.txt.test?format=libsvm"
 # The path of test data
-test:data = "machine.txt.test"
+test:data = "machine.txt.test?format=libsvm"
--- a/demo/c-api/basic/c-api-demo.c
+++ b/demo/c-api/basic/c-api-demo.c
@@ -42,8 +42,8 @@ int main() {

  // load the data
  DMatrixHandle dtrain, dtest;
-  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train", silent, &dtrain));
-  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test", silent, &dtest));
+  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train?format=libsvm", silent, &dtrain));
+  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test?format=libsvm", silent, &dtest));

  // create the booster
  BoosterHandle booster;
--- a/demo/guide-python/boost_from_prediction.py
+++ b/demo/guide-python/boost_from_prediction.py
@@ -7,15 +7,19 @@ import os
 import xgboost as xgb

 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
-watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)
+watchlist = [(dtest, "eval"), (dtrain, "train")]
 ###
 # advanced: start from a initial base prediction
 #
-print('start running example to start from a initial prediction')
+print("start running example to start from a initial prediction")
 # specify parameters via map, definition are same as c++ version
-param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
+param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
 # train xgboost for 1 round
 bst = xgb.train(param, dtrain, 1, watchlist)
 # Note: we need the margin value instead of transformed prediction in
@@ -27,5 +31,5 @@ ptest = bst.predict(dtest, output_margin=True)
 dtrain.set_base_margin(ptrain)
 dtest.set_base_margin(ptest)

-print('this is result of running from initial prediction')
+print("this is result of running from initial prediction")
 bst = xgb.train(param, dtrain, 1, watchlist)
--- a/demo/guide-python/cross_validation.py
+++ b/demo/guide-python/cross_validation.py
@@ -10,27 +10,45 @@ import xgboost as xgb

 # load data in do training
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'}
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
 num_round = 2

-print('running cross validation')
+print("running cross validation")
 # do cross validation, this will print result out as
 # [iteration]  metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
-xgb.cv(param, dtrain, num_round, nfold=5,
-       metrics={'error'}, seed=0,
-       callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)])
+xgb.cv(
+    param,
+    dtrain,
+    num_round,
+    nfold=5,
+    metrics={"error"},
+    seed=0,
+    callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)],
+)

-print('running cross validation, disable standard deviation display')
+print("running cross validation, disable standard deviation display")
 # do cross validation, this will print result out as
 # [iteration]  metric_name:mean_value
-res = xgb.cv(param, dtrain, num_boost_round=10, nfold=5,
-             metrics={'error'}, seed=0,
-             callbacks=[xgb.callback.EvaluationMonitor(show_stdv=False),
-                        xgb.callback.EarlyStopping(3)])
+res = xgb.cv(
+    param,
+    dtrain,
+    num_boost_round=10,
+    nfold=5,
+    metrics={"error"},
+    seed=0,
+    callbacks=[
+        xgb.callback.EvaluationMonitor(show_stdv=False),
+        xgb.callback.EarlyStopping(3),
+    ],
+)
 print(res)
-print('running cross validation, with preprocessing function')
+print("running cross validation, with preprocessing function")
+
+
 # define the preprocessing function
 # used to return the preprocessed training, test data, and parameter
 # we can use this to do weight rescale, etc.
@@ -38,32 +56,36 @@ print('running cross validation, with preprocessing function')
 def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
-    param['scale_pos_weight'] = ratio
+    param["scale_pos_weight"] = ratio
    return (dtrain, dtest, param)

+
 # do cross validation, for each fold
 # the dtrain, dtest, param will be passed into fpreproc
 # then the return value of fpreproc will be used to generate
 # results of that fold
-xgb.cv(param, dtrain, num_round, nfold=5,
-       metrics={'auc'}, seed=0, fpreproc=fpreproc)
+xgb.cv(param, dtrain, num_round, nfold=5, metrics={"auc"}, seed=0, fpreproc=fpreproc)

 ###
 # you can also do cross validation with customized loss function
 # See custom_objective.py
 ##
-print('running cross validation, with customized loss function')
+print("running cross validation, with customized loss function")
+
+
 def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess
+
+
 def evalerror(preds, dtrain):
    labels = dtrain.get_label()
-    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+    return "error", float(sum(labels != (preds > 0.0))) / len(labels)

-param = {'max_depth':2, 'eta':1}
+
+param = {"max_depth": 2, "eta": 1}
 # train with customized objective
-xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
-       obj=logregobj, feval=evalerror)
+xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror)
--- a/demo/guide-python/evals_result.py
+++ b/demo/guide-python/evals_result.py
@@ -7,28 +7,37 @@ import os
 import xgboost as xgb

 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)

-param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error')]
+param = [
+    ("max_depth", 2),
+    ("objective", "binary:logistic"),
+    ("eval_metric", "logloss"),
+    ("eval_metric", "error"),
+]

 num_round = 2
-watchlist = [(dtest,'eval'), (dtrain,'train')]
+watchlist = [(dtest, "eval"), (dtrain, "train")]

 evals_result = {}
 bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result)

-print('Access logloss metric directly from evals_result:')
-print(evals_result['eval']['logloss'])
+print("Access logloss metric directly from evals_result:")
+print(evals_result["eval"]["logloss"])

-print('')
-print('Access metrics through a loop:')
+print("")
+print("Access metrics through a loop:")
 for e_name, e_mtrs in evals_result.items():
-    print('- {}'.format(e_name))
+    print("- {}".format(e_name))
    for e_mtr_name, e_mtr_vals in e_mtrs.items():
-        print('   - {}'.format(e_mtr_name))
-        print('      - {}'.format(e_mtr_vals))
+        print("   - {}".format(e_mtr_name))
+        print("      - {}".format(e_mtr_vals))

-print('')
-print('Access complete dictionary:')
+print("")
+print("Access complete dictionary:")
 print(evals_result)
--- a/demo/guide-python/generalized_linear_model.py
+++ b/demo/guide-python/generalized_linear_model.py
@@ -11,14 +11,22 @@ import xgboost as xgb
 #  basically, we are using linear model, instead of tree for our boosters
 ##
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)
 # change booster to gblinear, so that we are fitting a linear model
 # alpha is the L1 regularizer
 # lambda is the L2 regularizer
 # you can also set lambda_bias which is L2 regularizer on the bias term
-param = {'objective':'binary:logistic', 'booster':'gblinear',
-         'alpha': 0.0001, 'lambda': 1}
+param = {
+    "objective": "binary:logistic",
+    "booster": "gblinear",
+    "alpha": 0.0001,
+    "lambda": 1,
+}

 # normally, you do not need to set eta (step_size)
 # XGBoost uses a parallel coordinate descent algorithm (shotgun),
@@ -29,9 +37,15 @@ param = {'objective':'binary:logistic', 'booster':'gblinear',
 ##
 # the rest of settings are the same
 ##
-watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+watchlist = [(dtest, "eval"), (dtrain, "train")]
 num_round = 4
 bst = xgb.train(param, dtrain, num_round, watchlist)
 preds = bst.predict(dtest)
 labels = dtest.get_label()
-print('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))))
+print(
+    "error=%f"
+    % (
+        sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
+        / float(len(preds))
+    )
+)
--- a/demo/guide-python/predict_first_ntree.py
+++ b/demo/guide-python/predict_first_ntree.py
@@ -16,8 +16,8 @@ test = os.path.join(CURRENT_DIR, "../data/agaricus.txt.test")

 def native_interface():
    # load data in do training
-    dtrain = xgb.DMatrix(train)
-    dtest = xgb.DMatrix(test)
+    dtrain = xgb.DMatrix(train + "?format=libsvm")
+    dtest = xgb.DMatrix(test + "?format=libsvm")
    param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
    watchlist = [(dtest, "eval"), (dtrain, "train")]
    num_round = 3
--- a/demo/guide-python/predict_leaf_indices.py
+++ b/demo/guide-python/predict_leaf_indices.py
@@ -8,14 +8,18 @@ import xgboost as xgb

 # load data in do training
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
-param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
-watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)
+param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
+watchlist = [(dtest, "eval"), (dtrain, "train")]
 num_round = 3
 bst = xgb.train(param, dtrain, num_round, watchlist)

-print('start testing predict the leaf indices')
+print("start testing predict the leaf indices")
 # predict using first 2 tree
 leafindex = bst.predict(
    dtest, iteration_range=(0, 2), pred_leaf=True, strict_shape=True
--- a/demo/nvflare/README.md
+++ b/demo/nvflare/README.md
@@ -3,61 +3,12 @@
 This directory contains a demo of Federated Learning using
 [NVFlare](https://nvidia.github.io/NVFlare/).

-## Training with CPU only
+## Horizontal Federated XGBoost

-To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+For horizontal federated learning using XGBoost (data is split row-wise), check out the `horizontal` directory
+(see the [README](horizontal/README.md)).

-Install NVFlare (note that currently NVFlare only supports Python 3.8):
-```shell
-pip install nvflare
-```
+## Vertical Federated XGBoost

-Prepare the data:
-```shell
-./prepare_data.sh
-```
-
-Start the NVFlare federated server:
-```shell
-/tmp/nvflare/poc/server/startup/start.sh
-```
-
-In another terminal, start the first worker:
-```shell
-/tmp/nvflare/poc/site-1/startup/start.sh
-```
-
-And the second worker:
-```shell
-/tmp/nvflare/poc/site-2/startup/start.sh
-```
-
-Then start the admin CLI:
-```shell
-/tmp/nvflare/poc/admin/startup/fl_admin.sh
-```
-
-In the admin CLI, run the following command:
-```shell
-submit_job hello-xgboost
-```
-
-Once the training finishes, the model file should be written into
-`/tmp/nvlfare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
-respectively.
-
-Finally, shutdown everything from the admin CLI, using `admin` as password:
-```shell
-shutdown client
-shutdown server
-```
-
-## Training with GPUs
-
-To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
-Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
-turned off (see the [README](../../plugin/federated/README.md)).
-
-Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
-above.
+For vertical federated learning using XGBoost (data is split column-wise), check out the `vertical` directory
+(see the [README](vertical/README.md)).
--- a/demo/nvflare/config/config_fed_client.json
+++ b/demo/nvflare/config/config_fed_client.json
@@ -1,23 +0,0 @@
-{
-  "format_version": 2,
-  "executors": [
-    {
-      "tasks": [
-        "train"
-      ],
-      "executor": {
-        "path": "trainer.XGBoostTrainer",
-        "args": {
-          "server_address": "localhost:9091",
-          "world_size": 2,
-          "server_cert_path": "server-cert.pem",
-          "client_key_path": "client-key.pem",
-          "client_cert_path": "client-cert.pem",
-          "use_gpus": "false"
-        }
-      }
-    }
-  ],
-  "task_result_filters": [],
-  "task_data_filters": []
-}
--- a/demo/nvflare/config/config_fed_server.json
+++ b/demo/nvflare/config/config_fed_server.json
@@ -1,22 +0,0 @@
-{
-  "format_version": 2,
-  "server": {
-    "heart_beat_timeout": 600
-  },
-  "task_data_filters": [],
-  "task_result_filters": [],
-  "workflows": [
-    {
-      "id": "server_workflow",
-      "path": "controller.XGBoostController",
-      "args": {
-        "port": 9091,
-        "world_size": 2,
-        "server_key_path": "server-key.pem",
-        "server_cert_path": "server-cert.pem",
-        "client_cert_path": "client-cert.pem"
-      }
-    }
-  ],
-  "components": []
-}
--- a/demo/nvflare/horizontal/README.md
+++ b/demo/nvflare/horizontal/README.md
@@ -0,0 +1,63 @@
+# Experimental Support of Horizontal Federated XGBoost using NVFlare
+
+This directory contains a demo of Horizontal Federated Learning using
+[NVFlare](https://nvidia.github.io/NVFlare/).
+
+## Training with CPU only
+
+To run the demo, first build XGBoost with the federated learning plugin enabled (see the
+[README](../../../plugin/federated/README.md)).
+
+Install NVFlare (note that currently NVFlare only supports Python 3.8):
+```shell
+pip install nvflare
+```
+
+Prepare the data:
+```shell
+./prepare_data.sh
+```
+
+Start the NVFlare federated server:
+```shell
+/tmp/nvflare/poc/server/startup/start.sh
+```
+
+In another terminal, start the first worker:
+```shell
+/tmp/nvflare/poc/site-1/startup/start.sh
+```
+
+And the second worker:
+```shell
+/tmp/nvflare/poc/site-2/startup/start.sh
+```
+
+Then start the admin CLI:
+```shell
+/tmp/nvflare/poc/admin/startup/fl_admin.sh
+```
+
+In the admin CLI, run the following command:
+```shell
+submit_job horizontal-xgboost
+```
+
+Once the training finishes, the model file should be written into
+`/tmp/nvflare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
+respectively.
+
+Finally, shutdown everything from the admin CLI, using `admin` as password:
+```shell
+shutdown client
+shutdown server
+```
+
+## Training with GPUs
+
+To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
+Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
+turned off (see the [README](../../../plugin/federated/README.md)).
+
+Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
+above.
--- a/demo/nvflare/horizontal/custom/controller.py
+++ b/demo/nvflare/horizontal/custom/controller.py
--- a/demo/nvflare/horizontal/custom/trainer.py
+++ b/demo/nvflare/horizontal/custom/trainer.py
--- a/demo/nvflare/horizontal/prepare_data.sh
+++ b/demo/nvflare/horizontal/prepare_data.sh
@@ -15,8 +15,8 @@ split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.train ag
 split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.test agaricus.txt.test-site-

 nvflare poc -n 2 --prepare
-mkdir -p /tmp/nvflare/poc/admin/transfer/hello-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/hello-xgboost
+mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
+cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for id in $(eval echo "{1..$world_size}"); do
  cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$id"/
--- a/demo/nvflare/vertical/README.md
+++ b/demo/nvflare/vertical/README.md
@@ -0,0 +1,59 @@
+# Experimental Support of Vertical Federated XGBoost using NVFlare
+
+This directory contains a demo of Vertical Federated Learning using
+[NVFlare](https://nvidia.github.io/NVFlare/).
+
+## Training with CPU only
+
+To run the demo, first build XGBoost with the federated learning plugin enabled (see the
+[README](../../../plugin/federated/README.md)).
+
+Install NVFlare (note that currently NVFlare only supports Python 3.8):
+```shell
+pip install nvflare
+```
+
+Prepare the data (note that this step will download the HIGGS dataset, which is 2.6GB compressed, and 7.5GB
+uncompressed, so make sure you have enough disk space and are on a fast internet connection):
+```shell
+./prepare_data.sh
+```
+
+Start the NVFlare federated server:
+```shell
+/tmp/nvflare/poc/server/startup/start.sh
+```
+
+In another terminal, start the first worker:
+```shell
+/tmp/nvflare/poc/site-1/startup/start.sh
+```
+
+And the second worker:
+```shell
+/tmp/nvflare/poc/site-2/startup/start.sh
+```
+
+Then start the admin CLI:
+```shell
+/tmp/nvflare/poc/admin/startup/fl_admin.sh
+```
+
+In the admin CLI, run the following command:
+```shell
+submit_job vertical-xgboost
+```
+
+Once the training finishes, the model file should be written into
+`/tmp/nvflare/poc/site-1/run_1/higgs.model.federated.vertical.json` and
+`/tmp/nvflare/poc/site-2/run_1/higgs.model.federated.vertical.json` respectively.
+
+Finally, shutdown everything from the admin CLI, using `admin` as password:
+```shell
+shutdown client
+shutdown server
+```
+
+## Training with GPUs
+
+Currently GPUs are not yet supported by vertical federated XGBoost.
--- a/demo/nvflare/vertical/custom/controller.py
+++ b/demo/nvflare/vertical/custom/controller.py
@@ -0,0 +1,68 @@
+"""
+Example of training controller with NVFlare
+===========================================
+"""
+import multiprocessing
+
+from nvflare.apis.client import Client
+from nvflare.apis.fl_context import FLContext
+from nvflare.apis.impl.controller import Controller, Task
+from nvflare.apis.shareable import Shareable
+from nvflare.apis.signal import Signal
+from trainer import SupportedTasks
+
+import xgboost.federated
+
+
+class XGBoostController(Controller):
+    def __init__(self, port: int, world_size: int, server_key_path: str,
+                 server_cert_path: str, client_cert_path: str):
+        """Controller for federated XGBoost.
+
+        Args:
+            port: the port for the gRPC server to listen on.
+            world_size: the number of sites.
+            server_key_path: the path to the server key file.
+            server_cert_path: the path to the server certificate file.
+            client_cert_path: the path to the client certificate file.
+        """
+        super().__init__()
+        self._port = port
+        self._world_size = world_size
+        self._server_key_path = server_key_path
+        self._server_cert_path = server_cert_path
+        self._client_cert_path = client_cert_path
+        self._server = None
+
+    def start_controller(self, fl_ctx: FLContext):
+        self._server = multiprocessing.Process(
+            target=xgboost.federated.run_federated_server,
+            args=(self._port, self._world_size, self._server_key_path,
+                  self._server_cert_path, self._client_cert_path))
+        self._server.start()
+
+    def stop_controller(self, fl_ctx: FLContext):
+        if self._server:
+            self._server.terminate()
+
+    def process_result_of_unknown_task(self, client: Client, task_name: str,
+                                       client_task_id: str, result: Shareable,
+                                       fl_ctx: FLContext):
+        self.log_warning(fl_ctx, f"Unknown task: {task_name} from client {client.name}.")
+
+    def control_flow(self, abort_signal: Signal, fl_ctx: FLContext):
+        self.log_info(fl_ctx, "XGBoost training control flow started.")
+        if abort_signal.triggered:
+            return
+        task = Task(name=SupportedTasks.TRAIN, data=Shareable())
+        self.broadcast_and_wait(
+            task=task,
+            min_responses=self._world_size,
+            fl_ctx=fl_ctx,
+            wait_time_after_min_received=1,
+            abort_signal=abort_signal,
+        )
+        if abort_signal.triggered:
+            return
+
+        self.log_info(fl_ctx, "XGBoost training control flow finished.")
--- a/demo/nvflare/vertical/custom/trainer.py
+++ b/demo/nvflare/vertical/custom/trainer.py
@@ -0,0 +1,97 @@
+import os
+
+from nvflare.apis.executor import Executor
+from nvflare.apis.fl_constant import FLContextKey, ReturnCode
+from nvflare.apis.fl_context import FLContext
+from nvflare.apis.shareable import Shareable, make_reply
+from nvflare.apis.signal import Signal
+
+import xgboost as xgb
+from xgboost import callback
+
+
+class SupportedTasks(object):
+    TRAIN = "train"
+
+
+class XGBoostTrainer(Executor):
+    def __init__(self, server_address: str, world_size: int, server_cert_path: str,
+                 client_key_path: str, client_cert_path: str):
+        """Trainer for federated XGBoost.
+
+        Args:
+            server_address: address for the gRPC server to connect to.
+            world_size: the number of sites.
+            server_cert_path: the path to the server certificate file.
+            client_key_path: the path to the client key file.
+            client_cert_path: the path to the client certificate file.
+        """
+        super().__init__()
+        self._server_address = server_address
+        self._world_size = world_size
+        self._server_cert_path = server_cert_path
+        self._client_key_path = client_key_path
+        self._client_cert_path = client_cert_path
+
+    def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
+                abort_signal: Signal) -> Shareable:
+        self.log_info(fl_ctx, f"Executing {task_name}")
+        try:
+            if task_name == SupportedTasks.TRAIN:
+                self._do_training(fl_ctx)
+                return make_reply(ReturnCode.OK)
+            else:
+                self.log_error(fl_ctx, f"{task_name} is not a supported task.")
+                return make_reply(ReturnCode.TASK_UNKNOWN)
+        except BaseException as e:
+            self.log_exception(fl_ctx,
+                               f"Task {task_name} failed. Exception: {e.__str__()}")
+            return make_reply(ReturnCode.EXECUTION_EXCEPTION)
+
+    def _do_training(self, fl_ctx: FLContext):
+        client_name = fl_ctx.get_prop(FLContextKey.CLIENT_NAME)
+        rank = int(client_name.split('-')[1]) - 1
+        communicator_env = {
+            'xgboost_communicator': 'federated',
+            'federated_server_address': self._server_address,
+            'federated_world_size': self._world_size,
+            'federated_rank': rank,
+            'federated_server_cert': self._server_cert_path,
+            'federated_client_key': self._client_key_path,
+            'federated_client_cert': self._client_cert_path
+        }
+        with xgb.collective.CommunicatorContext(**communicator_env):
+            # Load file, file will not be sharded in federated mode.
+            if rank == 0:
+                label = '&label_column=0'
+            else:
+                label = ''
+            dtrain = xgb.DMatrix(f'higgs.train.csv?format=csv{label}', data_split_mode=1)
+            dtest = xgb.DMatrix(f'higgs.test.csv?format=csv{label}', data_split_mode=1)
+
+            # specify parameters via map
+            param = {
+                'validate_parameters': True,
+                'eta': 0.1,
+                'gamma': 1.0,
+                'max_depth': 8,
+                'min_child_weight': 100,
+                'tree_method': 'approx',
+                'grow_policy': 'depthwise',
+                'objective': 'binary:logistic',
+                'eval_metric': 'auc',
+            }
+
+            # specify validations set to watch performance
+            watchlist = [(dtest, "eval"), (dtrain, "train")]
+            # number of boosting rounds
+            num_round = 10
+
+            bst = xgb.train(param, dtrain, num_round, evals=watchlist, early_stopping_rounds=2)
+
+            # Save the model.
+            workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
+            run_number = fl_ctx.get_prop(FLContextKey.CURRENT_RUN)
+            run_dir = workspace.get_run_dir(run_number)
+            bst.save_model(os.path.join(run_dir, "higgs.model.federated.vertical.json"))
+            xgb.collective.communicator_print("Finished training\n")
--- a/demo/nvflare/vertical/prepare_data.sh
+++ b/demo/nvflare/vertical/prepare_data.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+set -e
+
+rm -fr ./*.pem /tmp/nvflare/poc
+
+world_size=2
+
+# Generate server and client certificates.
+openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out server-cert.pem -subj "/C=US/CN=localhost"
+openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
+
+# Download HIGGS dataset.
+if [ -f "HIGGS.csv" ]; then
+  echo "HIGGS.csv exists, skipping download."
+else
+  echo "Downloading HIGGS dataset."
+  wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz
+  gunzip HIGGS.csv.gz
+fi
+
+# Split into train/test.
+if [[ -f higgs.train.csv && -f higgs.test.csv ]]; then
+  echo "higgs.train.csv and higgs.test.csv exist, skipping split."
+else
+  echo "Splitting HIGGS dataset into train/test."
+  head -n 10450000 HIGGS.csv > higgs.train.csv
+  tail -n 550000 HIGGS.csv > higgs.test.csv
+fi
+
+# Split train and test files by column to simulate a federated environment.
+site_files=(higgs.{train,test}.csv-site-*)
+if [ ${#site_files[@]} -eq $((world_size*2)) ]; then
+  echo "Site files exist, skipping split."
+else
+  echo "Splitting train/test into site files."
+  total_cols=28  # plus label
+  cols=$((total_cols/world_size))
+  echo "Columns per site: $cols"
+  for (( site=1; site<=world_size; site++ )); do
+    if (( site == 1 )); then
+      start=$((cols*(site-1)+1))
+    else
+      start=$((cols*(site-1)+2))
+    fi
+    if (( site == world_size )); then
+      end=$((total_cols+1))
+    else
+      end=$((cols*site+1))
+    fi
+    echo "Site $site, columns $start-$end"
+    cut -d, -f${start}-${end} higgs.train.csv > higgs.train.csv-site-"${site}"
+    cut -d, -f${start}-${end} higgs.test.csv > higgs.test.csv-site-"${site}"
+  done
+fi
+
+nvflare poc -n 2 --prepare
+mkdir -p /tmp/nvflare/poc/admin/transfer/vertical-xgboost
+cp -fr config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
+cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
+for (( site=1; site<=world_size; site++ )); do
+  cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"${site}"/
+  ln -s "${PWD}"/higgs.train.csv-site-"${site}" /tmp/nvflare/poc/site-"${site}"/higgs.train.csv
+  ln -s "${PWD}"/higgs.test.csv-site-"${site}" /tmp/nvflare/poc/site-"${site}"/higgs.test.csv
+done