merge 23Mar01
This commit is contained in:
@@ -20,10 +20,10 @@ num_round = 2
|
||||
# 0 means do not save any model except the final round model
|
||||
save_period = 2
|
||||
# The path of training data
|
||||
data = "agaricus.txt.train"
|
||||
data = "agaricus.txt.train?format=libsvm"
|
||||
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
||||
eval[test] = "agaricus.txt.test"
|
||||
eval[test] = "agaricus.txt.test?format=libsvm"
|
||||
# evaluate on training data as well each round
|
||||
eval_train = 1
|
||||
# The path of test data
|
||||
test:data = "agaricus.txt.test"
|
||||
test:data = "agaricus.txt.test?format=libsvm"
|
||||
|
||||
@@ -21,8 +21,8 @@ num_round = 2
|
||||
# 0 means do not save any model except the final round model
|
||||
save_period = 0
|
||||
# The path of training data
|
||||
data = "machine.txt.train"
|
||||
data = "machine.txt.train?format=libsvm"
|
||||
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
||||
eval[test] = "machine.txt.test"
|
||||
eval[test] = "machine.txt.test?format=libsvm"
|
||||
# The path of test data
|
||||
test:data = "machine.txt.test"
|
||||
test:data = "machine.txt.test?format=libsvm"
|
||||
|
||||
@@ -42,8 +42,8 @@ int main() {
|
||||
|
||||
// load the data
|
||||
DMatrixHandle dtrain, dtest;
|
||||
safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train", silent, &dtrain));
|
||||
safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test", silent, &dtest));
|
||||
safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train?format=libsvm", silent, &dtrain));
|
||||
safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test?format=libsvm", silent, &dtest));
|
||||
|
||||
// create the booster
|
||||
BoosterHandle booster;
|
||||
|
||||
@@ -7,15 +7,19 @@ import os
|
||||
import xgboost as xgb
|
||||
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
|
||||
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
dtrain = xgb.DMatrix(
|
||||
os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
|
||||
)
|
||||
dtest = xgb.DMatrix(
|
||||
os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
|
||||
)
|
||||
watchlist = [(dtest, "eval"), (dtrain, "train")]
|
||||
###
|
||||
# advanced: start from an initial base prediction
|
||||
#
|
||||
print('start running example to start from a initial prediction')
|
||||
print("start running example to start from a initial prediction")
|
||||
# specify parameters via map, definition are same as c++ version
|
||||
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
|
||||
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
|
||||
# train xgboost for 1 round
|
||||
bst = xgb.train(param, dtrain, 1, watchlist)
|
||||
# Note: we need the margin value instead of transformed prediction in
|
||||
@@ -27,5 +31,5 @@ ptest = bst.predict(dtest, output_margin=True)
|
||||
dtrain.set_base_margin(ptrain)
|
||||
dtest.set_base_margin(ptest)
|
||||
|
||||
print('this is result of running from initial prediction')
|
||||
print("this is result of running from initial prediction")
|
||||
bst = xgb.train(param, dtrain, 1, watchlist)
|
||||
|
||||
@@ -10,27 +10,45 @@ import xgboost as xgb
|
||||
|
||||
# load data and do training
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
|
||||
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'}
|
||||
dtrain = xgb.DMatrix(
|
||||
os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
|
||||
)
|
||||
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
|
||||
num_round = 2
|
||||
|
||||
print('running cross validation')
|
||||
print("running cross validation")
|
||||
# do cross validation, this will print result out as
|
||||
# [iteration] metric_name:mean_value+std_value
|
||||
# std_value is standard deviation of the metric
|
||||
xgb.cv(param, dtrain, num_round, nfold=5,
|
||||
metrics={'error'}, seed=0,
|
||||
callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)])
|
||||
xgb.cv(
|
||||
param,
|
||||
dtrain,
|
||||
num_round,
|
||||
nfold=5,
|
||||
metrics={"error"},
|
||||
seed=0,
|
||||
callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)],
|
||||
)
|
||||
|
||||
print('running cross validation, disable standard deviation display')
|
||||
print("running cross validation, disable standard deviation display")
|
||||
# do cross validation, this will print result out as
|
||||
# [iteration] metric_name:mean_value
|
||||
res = xgb.cv(param, dtrain, num_boost_round=10, nfold=5,
|
||||
metrics={'error'}, seed=0,
|
||||
callbacks=[xgb.callback.EvaluationMonitor(show_stdv=False),
|
||||
xgb.callback.EarlyStopping(3)])
|
||||
res = xgb.cv(
|
||||
param,
|
||||
dtrain,
|
||||
num_boost_round=10,
|
||||
nfold=5,
|
||||
metrics={"error"},
|
||||
seed=0,
|
||||
callbacks=[
|
||||
xgb.callback.EvaluationMonitor(show_stdv=False),
|
||||
xgb.callback.EarlyStopping(3),
|
||||
],
|
||||
)
|
||||
print(res)
|
||||
print('running cross validation, with preprocessing function')
|
||||
print("running cross validation, with preprocessing function")
|
||||
|
||||
|
||||
# define the preprocessing function
|
||||
# used to return the preprocessed training, test data, and parameter
|
||||
# we can use this to do weight rescale, etc.
|
||||
@@ -38,32 +56,36 @@ print('running cross validation, with preprocessing function')
|
||||
def fpreproc(dtrain, dtest, param):
|
||||
label = dtrain.get_label()
|
||||
ratio = float(np.sum(label == 0)) / np.sum(label == 1)
|
||||
param['scale_pos_weight'] = ratio
|
||||
param["scale_pos_weight"] = ratio
|
||||
return (dtrain, dtest, param)
|
||||
|
||||
|
||||
# do cross validation, for each fold
|
||||
# the dtrain, dtest, param will be passed into fpreproc
|
||||
# then the return value of fpreproc will be used to generate
|
||||
# results of that fold
|
||||
xgb.cv(param, dtrain, num_round, nfold=5,
|
||||
metrics={'auc'}, seed=0, fpreproc=fpreproc)
|
||||
xgb.cv(param, dtrain, num_round, nfold=5, metrics={"auc"}, seed=0, fpreproc=fpreproc)
|
||||
|
||||
###
|
||||
# you can also do cross validation with customized loss function
|
||||
# See custom_objective.py
|
||||
##
|
||||
print('running cross validation, with customized loss function')
|
||||
print("running cross validation, with customized loss function")
|
||||
|
||||
|
||||
def logregobj(preds, dtrain):
|
||||
labels = dtrain.get_label()
|
||||
preds = 1.0 / (1.0 + np.exp(-preds))
|
||||
grad = preds - labels
|
||||
hess = preds * (1.0 - preds)
|
||||
return grad, hess
|
||||
|
||||
|
||||
def evalerror(preds, dtrain):
|
||||
labels = dtrain.get_label()
|
||||
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
|
||||
return "error", float(sum(labels != (preds > 0.0))) / len(labels)
|
||||
|
||||
param = {'max_depth':2, 'eta':1}
|
||||
|
||||
param = {"max_depth": 2, "eta": 1}
|
||||
# train with customized objective
|
||||
xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
|
||||
obj=logregobj, feval=evalerror)
|
||||
xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror)
|
||||
|
||||
@@ -7,28 +7,37 @@ import os
|
||||
import xgboost as xgb
|
||||
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
|
||||
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
|
||||
dtrain = xgb.DMatrix(
|
||||
os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
|
||||
)
|
||||
dtest = xgb.DMatrix(
|
||||
os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
|
||||
)
|
||||
|
||||
param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error')]
|
||||
param = [
|
||||
("max_depth", 2),
|
||||
("objective", "binary:logistic"),
|
||||
("eval_metric", "logloss"),
|
||||
("eval_metric", "error"),
|
||||
]
|
||||
|
||||
num_round = 2
|
||||
watchlist = [(dtest,'eval'), (dtrain,'train')]
|
||||
watchlist = [(dtest, "eval"), (dtrain, "train")]
|
||||
|
||||
evals_result = {}
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result)
|
||||
|
||||
print('Access logloss metric directly from evals_result:')
|
||||
print(evals_result['eval']['logloss'])
|
||||
print("Access logloss metric directly from evals_result:")
|
||||
print(evals_result["eval"]["logloss"])
|
||||
|
||||
print('')
|
||||
print('Access metrics through a loop:')
|
||||
print("")
|
||||
print("Access metrics through a loop:")
|
||||
for e_name, e_mtrs in evals_result.items():
|
||||
print('- {}'.format(e_name))
|
||||
print("- {}".format(e_name))
|
||||
for e_mtr_name, e_mtr_vals in e_mtrs.items():
|
||||
print(' - {}'.format(e_mtr_name))
|
||||
print(' - {}'.format(e_mtr_vals))
|
||||
print(" - {}".format(e_mtr_name))
|
||||
print(" - {}".format(e_mtr_vals))
|
||||
|
||||
print('')
|
||||
print('Access complete dictionary:')
|
||||
print("")
|
||||
print("Access complete dictionary:")
|
||||
print(evals_result)
|
||||
|
||||
@@ -11,14 +11,22 @@ import xgboost as xgb
|
||||
# basically, we are using linear model, instead of tree for our boosters
|
||||
##
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
|
||||
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
|
||||
dtrain = xgb.DMatrix(
|
||||
os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
|
||||
)
|
||||
dtest = xgb.DMatrix(
|
||||
os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
|
||||
)
|
||||
# change booster to gblinear, so that we are fitting a linear model
|
||||
# alpha is the L1 regularizer
|
||||
# lambda is the L2 regularizer
|
||||
# you can also set lambda_bias which is L2 regularizer on the bias term
|
||||
param = {'objective':'binary:logistic', 'booster':'gblinear',
|
||||
'alpha': 0.0001, 'lambda': 1}
|
||||
param = {
|
||||
"objective": "binary:logistic",
|
||||
"booster": "gblinear",
|
||||
"alpha": 0.0001,
|
||||
"lambda": 1,
|
||||
}
|
||||
|
||||
# normally, you do not need to set eta (step_size)
|
||||
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
|
||||
@@ -29,9 +37,15 @@ param = {'objective':'binary:logistic', 'booster':'gblinear',
|
||||
##
|
||||
# the rest of settings are the same
|
||||
##
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
watchlist = [(dtest, "eval"), (dtrain, "train")]
|
||||
num_round = 4
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
preds = bst.predict(dtest)
|
||||
labels = dtest.get_label()
|
||||
print('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))))
|
||||
print(
|
||||
"error=%f"
|
||||
% (
|
||||
sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
|
||||
/ float(len(preds))
|
||||
)
|
||||
)
|
||||
|
||||
@@ -16,8 +16,8 @@ test = os.path.join(CURRENT_DIR, "../data/agaricus.txt.test")
|
||||
|
||||
def native_interface():
|
||||
# load data and do training
|
||||
dtrain = xgb.DMatrix(train)
|
||||
dtest = xgb.DMatrix(test)
|
||||
dtrain = xgb.DMatrix(train + "?format=libsvm")
|
||||
dtest = xgb.DMatrix(test + "?format=libsvm")
|
||||
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
|
||||
watchlist = [(dtest, "eval"), (dtrain, "train")]
|
||||
num_round = 3
|
||||
|
||||
@@ -8,14 +8,18 @@ import xgboost as xgb
|
||||
|
||||
# load data and do training
|
||||
CURRENT_DIR = os.path.dirname(__file__)
|
||||
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
|
||||
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
|
||||
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
dtrain = xgb.DMatrix(
|
||||
os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
|
||||
)
|
||||
dtest = xgb.DMatrix(
|
||||
os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
|
||||
)
|
||||
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
|
||||
watchlist = [(dtest, "eval"), (dtrain, "train")]
|
||||
num_round = 3
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
|
||||
print('start testing predict the leaf indices')
|
||||
print("start testing predict the leaf indices")
|
||||
# predict using first 2 tree
|
||||
leafindex = bst.predict(
|
||||
dtest, iteration_range=(0, 2), pred_leaf=True, strict_shape=True
|
||||
|
||||
@@ -3,61 +3,12 @@
|
||||
This directory contains a demo of Federated Learning using
|
||||
[NVFlare](https://nvidia.github.io/NVFlare/).
|
||||
|
||||
## Training with CPU only
|
||||
## Horizontal Federated XGBoost
|
||||
|
||||
To run the demo, first build XGBoost with the federated learning plugin enabled (see the
|
||||
[README](../../plugin/federated/README.md)).
|
||||
For horizontal federated learning using XGBoost (data is split row-wise), check out the `horizontal` directory
|
||||
(see the [README](horizontal/README.md)).
|
||||
|
||||
Install NVFlare (note that currently NVFlare only supports Python 3.8):
|
||||
```shell
|
||||
pip install nvflare
|
||||
```
|
||||
## Vertical Federated XGBoost
|
||||
|
||||
Prepare the data:
|
||||
```shell
|
||||
./prepare_data.sh
|
||||
```
|
||||
|
||||
Start the NVFlare federated server:
|
||||
```shell
|
||||
/tmp/nvflare/poc/server/startup/start.sh
|
||||
```
|
||||
|
||||
In another terminal, start the first worker:
|
||||
```shell
|
||||
/tmp/nvflare/poc/site-1/startup/start.sh
|
||||
```
|
||||
|
||||
And the second worker:
|
||||
```shell
|
||||
/tmp/nvflare/poc/site-2/startup/start.sh
|
||||
```
|
||||
|
||||
Then start the admin CLI:
|
||||
```shell
|
||||
/tmp/nvflare/poc/admin/startup/fl_admin.sh
|
||||
```
|
||||
|
||||
In the admin CLI, run the following command:
|
||||
```shell
|
||||
submit_job hello-xgboost
|
||||
```
|
||||
|
||||
Once the training finishes, the model file should be written into
|
||||
`/tmp/nvflare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
|
||||
respectively.
|
||||
|
||||
Finally, shut down everything from the admin CLI, using `admin` as the password:
|
||||
```shell
|
||||
shutdown client
|
||||
shutdown server
|
||||
```
|
||||
|
||||
## Training with GPUs
|
||||
|
||||
To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
|
||||
Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
|
||||
turned off (see the [README](../../plugin/federated/README.md)).
|
||||
|
||||
Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
|
||||
above.
|
||||
For vertical federated learning using XGBoost (data is split column-wise), check out the `vertical` directory
|
||||
(see the [README](vertical/README.md)).
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
{
|
||||
"format_version": 2,
|
||||
"executors": [
|
||||
{
|
||||
"tasks": [
|
||||
"train"
|
||||
],
|
||||
"executor": {
|
||||
"path": "trainer.XGBoostTrainer",
|
||||
"args": {
|
||||
"server_address": "localhost:9091",
|
||||
"world_size": 2,
|
||||
"server_cert_path": "server-cert.pem",
|
||||
"client_key_path": "client-key.pem",
|
||||
"client_cert_path": "client-cert.pem",
|
||||
"use_gpus": "false"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"task_result_filters": [],
|
||||
"task_data_filters": []
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
{
|
||||
"format_version": 2,
|
||||
"server": {
|
||||
"heart_beat_timeout": 600
|
||||
},
|
||||
"task_data_filters": [],
|
||||
"task_result_filters": [],
|
||||
"workflows": [
|
||||
{
|
||||
"id": "server_workflow",
|
||||
"path": "controller.XGBoostController",
|
||||
"args": {
|
||||
"port": 9091,
|
||||
"world_size": 2,
|
||||
"server_key_path": "server-key.pem",
|
||||
"server_cert_path": "server-cert.pem",
|
||||
"client_cert_path": "client-cert.pem"
|
||||
}
|
||||
}
|
||||
],
|
||||
"components": []
|
||||
}
|
||||
63
demo/nvflare/horizontal/README.md
Normal file
63
demo/nvflare/horizontal/README.md
Normal file
@@ -0,0 +1,63 @@
|
||||
# Experimental Support of Horizontal Federated XGBoost using NVFlare
|
||||
|
||||
This directory contains a demo of Horizontal Federated Learning using
|
||||
[NVFlare](https://nvidia.github.io/NVFlare/).
|
||||
|
||||
## Training with CPU only
|
||||
|
||||
To run the demo, first build XGBoost with the federated learning plugin enabled (see the
|
||||
[README](../../plugin/federated/README.md)).
|
||||
|
||||
Install NVFlare (note that currently NVFlare only supports Python 3.8):
|
||||
```shell
|
||||
pip install nvflare
|
||||
```
|
||||
|
||||
Prepare the data:
|
||||
```shell
|
||||
./prepare_data.sh
|
||||
```
|
||||
|
||||
Start the NVFlare federated server:
|
||||
```shell
|
||||
/tmp/nvflare/poc/server/startup/start.sh
|
||||
```
|
||||
|
||||
In another terminal, start the first worker:
|
||||
```shell
|
||||
/tmp/nvflare/poc/site-1/startup/start.sh
|
||||
```
|
||||
|
||||
And the second worker:
|
||||
```shell
|
||||
/tmp/nvflare/poc/site-2/startup/start.sh
|
||||
```
|
||||
|
||||
Then start the admin CLI:
|
||||
```shell
|
||||
/tmp/nvflare/poc/admin/startup/fl_admin.sh
|
||||
```
|
||||
|
||||
In the admin CLI, run the following command:
|
||||
```shell
|
||||
submit_job horizontal-xgboost
|
||||
```
|
||||
|
||||
Once the training finishes, the model file should be written into
|
||||
`/tmp/nvflare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
|
||||
respectively.
|
||||
|
||||
Finally, shut down everything from the admin CLI, using `admin` as the password:
|
||||
```shell
|
||||
shutdown client
|
||||
shutdown server
|
||||
```
|
||||
|
||||
## Training with GPUs
|
||||
|
||||
To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
|
||||
Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
|
||||
turned off (see the [README](../../plugin/federated/README.md)).
|
||||
|
||||
Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
|
||||
above.
|
||||
@@ -15,8 +15,8 @@ split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.train ag
|
||||
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.test agaricus.txt.test-site-
|
||||
|
||||
nvflare poc -n 2 --prepare
|
||||
mkdir -p /tmp/nvflare/poc/admin/transfer/hello-xgboost
|
||||
cp -fr config custom /tmp/nvflare/poc/admin/transfer/hello-xgboost
|
||||
mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
|
||||
cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
|
||||
cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
|
||||
for id in $(eval echo "{1..$world_size}"); do
|
||||
cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$id"/
|
||||
59
demo/nvflare/vertical/README.md
Normal file
59
demo/nvflare/vertical/README.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# Experimental Support of Vertical Federated XGBoost using NVFlare
|
||||
|
||||
This directory contains a demo of Vertical Federated Learning using
|
||||
[NVFlare](https://nvidia.github.io/NVFlare/).
|
||||
|
||||
## Training with CPU only
|
||||
|
||||
To run the demo, first build XGBoost with the federated learning plugin enabled (see the
|
||||
[README](../../plugin/federated/README.md)).
|
||||
|
||||
Install NVFlare (note that currently NVFlare only supports Python 3.8):
|
||||
```shell
|
||||
pip install nvflare
|
||||
```
|
||||
|
||||
Prepare the data (note that this step will download the HIGGS dataset, which is 2.6GB compressed, and 7.5GB
|
||||
uncompressed, so make sure you have enough disk space and are on a fast internet connection):
|
||||
```shell
|
||||
./prepare_data.sh
|
||||
```
|
||||
|
||||
Start the NVFlare federated server:
|
||||
```shell
|
||||
/tmp/nvflare/poc/server/startup/start.sh
|
||||
```
|
||||
|
||||
In another terminal, start the first worker:
|
||||
```shell
|
||||
/tmp/nvflare/poc/site-1/startup/start.sh
|
||||
```
|
||||
|
||||
And the second worker:
|
||||
```shell
|
||||
/tmp/nvflare/poc/site-2/startup/start.sh
|
||||
```
|
||||
|
||||
Then start the admin CLI:
|
||||
```shell
|
||||
/tmp/nvflare/poc/admin/startup/fl_admin.sh
|
||||
```
|
||||
|
||||
In the admin CLI, run the following command:
|
||||
```shell
|
||||
submit_job vertical-xgboost
|
||||
```
|
||||
|
||||
Once the training finishes, the model file should be written into
|
||||
`/tmp/nvflare/poc/site-1/run_1/higgs.model.federated.vertical.json` and `/tmp/nvflare/poc/site-2/run_1/higgs.model.federated.vertical.json`
|
||||
respectively.
|
||||
|
||||
Finally, shut down everything from the admin CLI, using `admin` as the password:
|
||||
```shell
|
||||
shutdown client
|
||||
shutdown server
|
||||
```
|
||||
|
||||
## Training with GPUs
|
||||
|
||||
Currently GPUs are not yet supported by vertical federated XGBoost.
|
||||
68
demo/nvflare/vertical/custom/controller.py
Normal file
68
demo/nvflare/vertical/custom/controller.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""
|
||||
Example of training controller with NVFlare
|
||||
===========================================
|
||||
"""
|
||||
import multiprocessing
|
||||
|
||||
from nvflare.apis.client import Client
|
||||
from nvflare.apis.fl_context import FLContext
|
||||
from nvflare.apis.impl.controller import Controller, Task
|
||||
from nvflare.apis.shareable import Shareable
|
||||
from nvflare.apis.signal import Signal
|
||||
from trainer import SupportedTasks
|
||||
|
||||
import xgboost.federated
|
||||
|
||||
|
||||
class XGBoostController(Controller):
|
||||
def __init__(self, port: int, world_size: int, server_key_path: str,
|
||||
server_cert_path: str, client_cert_path: str):
|
||||
"""Controller for federated XGBoost.
|
||||
|
||||
Args:
|
||||
port: the port for the gRPC server to listen on.
|
||||
world_size: the number of sites.
|
||||
server_key_path: the path to the server key file.
|
||||
server_cert_path: the path to the server certificate file.
|
||||
client_cert_path: the path to the client certificate file.
|
||||
"""
|
||||
super().__init__()
|
||||
self._port = port
|
||||
self._world_size = world_size
|
||||
self._server_key_path = server_key_path
|
||||
self._server_cert_path = server_cert_path
|
||||
self._client_cert_path = client_cert_path
|
||||
self._server = None
|
||||
|
||||
def start_controller(self, fl_ctx: FLContext):
|
||||
self._server = multiprocessing.Process(
|
||||
target=xgboost.federated.run_federated_server,
|
||||
args=(self._port, self._world_size, self._server_key_path,
|
||||
self._server_cert_path, self._client_cert_path))
|
||||
self._server.start()
|
||||
|
||||
def stop_controller(self, fl_ctx: FLContext):
|
||||
if self._server:
|
||||
self._server.terminate()
|
||||
|
||||
def process_result_of_unknown_task(self, client: Client, task_name: str,
|
||||
client_task_id: str, result: Shareable,
|
||||
fl_ctx: FLContext):
|
||||
self.log_warning(fl_ctx, f"Unknown task: {task_name} from client {client.name}.")
|
||||
|
||||
def control_flow(self, abort_signal: Signal, fl_ctx: FLContext):
|
||||
self.log_info(fl_ctx, "XGBoost training control flow started.")
|
||||
if abort_signal.triggered:
|
||||
return
|
||||
task = Task(name=SupportedTasks.TRAIN, data=Shareable())
|
||||
self.broadcast_and_wait(
|
||||
task=task,
|
||||
min_responses=self._world_size,
|
||||
fl_ctx=fl_ctx,
|
||||
wait_time_after_min_received=1,
|
||||
abort_signal=abort_signal,
|
||||
)
|
||||
if abort_signal.triggered:
|
||||
return
|
||||
|
||||
self.log_info(fl_ctx, "XGBoost training control flow finished.")
|
||||
97
demo/nvflare/vertical/custom/trainer.py
Normal file
97
demo/nvflare/vertical/custom/trainer.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import os
|
||||
|
||||
from nvflare.apis.executor import Executor
|
||||
from nvflare.apis.fl_constant import FLContextKey, ReturnCode
|
||||
from nvflare.apis.fl_context import FLContext
|
||||
from nvflare.apis.shareable import Shareable, make_reply
|
||||
from nvflare.apis.signal import Signal
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import callback
|
||||
|
||||
|
||||
class SupportedTasks(object):
|
||||
TRAIN = "train"
|
||||
|
||||
|
||||
class XGBoostTrainer(Executor):
|
||||
def __init__(self, server_address: str, world_size: int, server_cert_path: str,
|
||||
client_key_path: str, client_cert_path: str):
|
||||
"""Trainer for federated XGBoost.
|
||||
|
||||
Args:
|
||||
server_address: address for the gRPC server to connect to.
|
||||
world_size: the number of sites.
|
||||
server_cert_path: the path to the server certificate file.
|
||||
client_key_path: the path to the client key file.
|
||||
client_cert_path: the path to the client certificate file.
|
||||
"""
|
||||
super().__init__()
|
||||
self._server_address = server_address
|
||||
self._world_size = world_size
|
||||
self._server_cert_path = server_cert_path
|
||||
self._client_key_path = client_key_path
|
||||
self._client_cert_path = client_cert_path
|
||||
|
||||
def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
|
||||
abort_signal: Signal) -> Shareable:
|
||||
self.log_info(fl_ctx, f"Executing {task_name}")
|
||||
try:
|
||||
if task_name == SupportedTasks.TRAIN:
|
||||
self._do_training(fl_ctx)
|
||||
return make_reply(ReturnCode.OK)
|
||||
else:
|
||||
self.log_error(fl_ctx, f"{task_name} is not a supported task.")
|
||||
return make_reply(ReturnCode.TASK_UNKNOWN)
|
||||
except BaseException as e:
|
||||
self.log_exception(fl_ctx,
|
||||
f"Task {task_name} failed. Exception: {e.__str__()}")
|
||||
return make_reply(ReturnCode.EXECUTION_EXCEPTION)
|
||||
|
||||
def _do_training(self, fl_ctx: FLContext):
|
||||
client_name = fl_ctx.get_prop(FLContextKey.CLIENT_NAME)
|
||||
rank = int(client_name.split('-')[1]) - 1
|
||||
communicator_env = {
|
||||
'xgboost_communicator': 'federated',
|
||||
'federated_server_address': self._server_address,
|
||||
'federated_world_size': self._world_size,
|
||||
'federated_rank': rank,
|
||||
'federated_server_cert': self._server_cert_path,
|
||||
'federated_client_key': self._client_key_path,
|
||||
'federated_client_cert': self._client_cert_path
|
||||
}
|
||||
with xgb.collective.CommunicatorContext(**communicator_env):
|
||||
# Load file, file will not be sharded in federated mode.
|
||||
if rank == 0:
|
||||
label = '&label_column=0'
|
||||
else:
|
||||
label = ''
|
||||
dtrain = xgb.DMatrix(f'higgs.train.csv?format=csv{label}', data_split_mode=1)
|
||||
dtest = xgb.DMatrix(f'higgs.test.csv?format=csv{label}', data_split_mode=1)
|
||||
|
||||
# specify parameters via map
|
||||
param = {
|
||||
'validate_parameters': True,
|
||||
'eta': 0.1,
|
||||
'gamma': 1.0,
|
||||
'max_depth': 8,
|
||||
'min_child_weight': 100,
|
||||
'tree_method': 'approx',
|
||||
'grow_policy': 'depthwise',
|
||||
'objective': 'binary:logistic',
|
||||
'eval_metric': 'auc',
|
||||
}
|
||||
|
||||
# specify validations set to watch performance
|
||||
watchlist = [(dtest, "eval"), (dtrain, "train")]
|
||||
# number of boosting rounds
|
||||
num_round = 10
|
||||
|
||||
bst = xgb.train(param, dtrain, num_round, evals=watchlist, early_stopping_rounds=2)
|
||||
|
||||
# Save the model.
|
||||
workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
|
||||
run_number = fl_ctx.get_prop(FLContextKey.CURRENT_RUN)
|
||||
run_dir = workspace.get_run_dir(run_number)
|
||||
bst.save_model(os.path.join(run_dir, "higgs.model.federated.vertical.json"))
|
||||
xgb.collective.communicator_print("Finished training\n")
|
||||
65
demo/nvflare/vertical/prepare_data.sh
Executable file
65
demo/nvflare/vertical/prepare_data.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
rm -fr ./*.pem /tmp/nvflare/poc
|
||||
|
||||
world_size=2
|
||||
|
||||
# Generate server and client certificates.
|
||||
openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out server-cert.pem -subj "/C=US/CN=localhost"
|
||||
openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
|
||||
|
||||
# Download HIGGS dataset.
|
||||
if [ -f "HIGGS.csv" ]; then
|
||||
echo "HIGGS.csv exists, skipping download."
|
||||
else
|
||||
echo "Downloading HIGGS dataset."
|
||||
wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz
|
||||
gunzip HIGGS.csv.gz
|
||||
fi
|
||||
|
||||
# Split into train/test.
|
||||
if [[ -f higgs.train.csv && -f higgs.test.csv ]]; then
|
||||
echo "higgs.train.csv and higgs.test.csv exist, skipping split."
|
||||
else
|
||||
echo "Splitting HIGGS dataset into train/test."
|
||||
head -n 10450000 HIGGS.csv > higgs.train.csv
|
||||
tail -n 550000 HIGGS.csv > higgs.test.csv
|
||||
fi
|
||||
|
||||
# Split train and test files by column to simulate a federated environment.
|
||||
site_files=(higgs.{train,test}.csv-site-*)
|
||||
if [ ${#site_files[@]} -eq $((world_size*2)) ]; then
|
||||
echo "Site files exist, skipping split."
|
||||
else
|
||||
echo "Splitting train/test into site files."
|
||||
total_cols=28 # plus label
|
||||
cols=$((total_cols/world_size))
|
||||
echo "Columns per site: $cols"
|
||||
for (( site=1; site<=world_size; site++ )); do
|
||||
if (( site == 1 )); then
|
||||
start=$((cols*(site-1)+1))
|
||||
else
|
||||
start=$((cols*(site-1)+2))
|
||||
fi
|
||||
if (( site == world_size )); then
|
||||
end=$((total_cols+1))
|
||||
else
|
||||
end=$((cols*site+1))
|
||||
fi
|
||||
echo "Site $site, columns $start-$end"
|
||||
cut -d, -f${start}-${end} higgs.train.csv > higgs.train.csv-site-"${site}"
|
||||
cut -d, -f${start}-${end} higgs.test.csv > higgs.test.csv-site-"${site}"
|
||||
done
|
||||
fi
|
||||
|
||||
nvflare poc -n 2 --prepare
|
||||
mkdir -p /tmp/nvflare/poc/admin/transfer/vertical-xgboost
|
||||
cp -fr config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
|
||||
cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
|
||||
for (( site=1; site<=world_size; site++ )); do
|
||||
cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"${site}"/
|
||||
ln -s "${PWD}"/higgs.train.csv-site-"${site}" /tmp/nvflare/poc/site-"${site}"/higgs.train.csv
|
||||
ln -s "${PWD}"/higgs.test.csv-site-"${site}" /tmp/nvflare/poc/site-"${site}"/higgs.test.csv
|
||||
done
|
||||
Reference in New Issue
Block a user