Fix nvflare horizontal demo (#9124)

This commit is contained in:
Rong Ou 2023-05-05 01:48:22 -07:00 committed by GitHub
parent 47b3cb6fb7
commit 250b22dd22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 40 additions and 11 deletions

View File

@ -43,9 +43,38 @@ In the admin CLI, run the following command:
submit_job horizontal-xgboost
```
Make a note of the job id:
```console
Submitted job: 28309e77-a7c5-45e6-b2bc-c2e3655122d8
```
On both workers, you should see train and eval losses printed:
```console
[10:45:41] [0] eval-logloss:0.22646 train-logloss:0.23316
[10:45:41] [1] eval-logloss:0.13776 train-logloss:0.13654
[10:45:41] [2] eval-logloss:0.08036 train-logloss:0.08243
[10:45:41] [3] eval-logloss:0.05830 train-logloss:0.05645
[10:45:41] [4] eval-logloss:0.03825 train-logloss:0.04148
[10:45:41] [5] eval-logloss:0.02660 train-logloss:0.02958
[10:45:41] [6] eval-logloss:0.01386 train-logloss:0.01918
[10:45:41] [7] eval-logloss:0.01018 train-logloss:0.01331
[10:45:41] [8] eval-logloss:0.00847 train-logloss:0.01112
[10:45:41] [9] eval-logloss:0.00691 train-logloss:0.00662
[10:45:41] [10] eval-logloss:0.00543 train-logloss:0.00503
[10:45:41] [11] eval-logloss:0.00445 train-logloss:0.00420
[10:45:41] [12] eval-logloss:0.00336 train-logloss:0.00355
[10:45:41] [13] eval-logloss:0.00277 train-logloss:0.00280
[10:45:41] [14] eval-logloss:0.00252 train-logloss:0.00244
[10:45:41] [15] eval-logloss:0.00177 train-logloss:0.00193
[10:45:41] [16] eval-logloss:0.00156 train-logloss:0.00161
[10:45:41] [17] eval-logloss:0.00135 train-logloss:0.00142
[10:45:41] [18] eval-logloss:0.00123 train-logloss:0.00125
[10:45:41] [19] eval-logloss:0.00106 train-logloss:0.00107
```
Once the training finishes, the model file should be written into
`/tmp/nvflare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
respectively.
`/tmp/nvflare/poc/site-1/${job_id}/test.model.json` and `/tmp/nvflare/poc/site-2/${job_id}/test.model.json`
respectively, where `job_id` is the UUID printed when you ran `submit_job`.
Finally, shut down everything from the admin CLI, using `admin` as the password:
```shell

View File

@ -63,8 +63,8 @@ class XGBoostTrainer(Executor):
}
with xgb.collective.CommunicatorContext(**communicator_env):
# Load file, file will not be sharded in federated mode.
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')
dtrain = xgb.DMatrix('agaricus.txt.train?format=libsvm')
dtest = xgb.DMatrix('agaricus.txt.test?format=libsvm')
# Specify parameters via map, definition are same as c++ version
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}

View File

@ -2,7 +2,7 @@
set -e
rm -fr ./agaricus* ./*.pem ./poc
rm -fr ./agaricus* ./*.pem /tmp/nvflare
world_size=2
@ -11,15 +11,15 @@ openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out se
openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
# Split train and test files manually to simulate a federated environment.
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.train agaricus.txt.train-site-
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.test agaricus.txt.test-site-
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.train agaricus.txt.train-site-
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.test agaricus.txt.test-site-
nvflare poc -n 2 --prepare
mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
for id in $(eval echo "{1..$world_size}"); do
cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$id"/
cp agaricus.txt.train-site-"$id" /tmp/nvflare/poc/site-"$id"/agaricus.txt.train
cp agaricus.txt.test-site-"$id" /tmp/nvflare/poc/site-"$id"/agaricus.txt.test
for (( site=1; site<=world_size; site++ )); do
cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$site"/
cp agaricus.txt.train-site-"$site" /tmp/nvflare/poc/site-"$site"/agaricus.txt.train
cp agaricus.txt.test-site-"$site" /tmp/nvflare/poc/site-"$site"/agaricus.txt.test
done