Fix nvflare horizontal demo (#9124)
This commit is contained in:
parent
47b3cb6fb7
commit
250b22dd22
@ -43,9 +43,38 @@ In the admin CLI, run the following command:
|
||||
submit_job horizontal-xgboost
|
||||
```
|
||||
|
||||
Make a note of the job id:
|
||||
```console
|
||||
Submitted job: 28309e77-a7c5-45e6-b2bc-c2e3655122d8
|
||||
```
|
||||
|
||||
On both workers, you should see train and eval losses printed:
|
||||
```console
|
||||
[10:45:41] [0] eval-logloss:0.22646 train-logloss:0.23316
|
||||
[10:45:41] [1] eval-logloss:0.13776 train-logloss:0.13654
|
||||
[10:45:41] [2] eval-logloss:0.08036 train-logloss:0.08243
|
||||
[10:45:41] [3] eval-logloss:0.05830 train-logloss:0.05645
|
||||
[10:45:41] [4] eval-logloss:0.03825 train-logloss:0.04148
|
||||
[10:45:41] [5] eval-logloss:0.02660 train-logloss:0.02958
|
||||
[10:45:41] [6] eval-logloss:0.01386 train-logloss:0.01918
|
||||
[10:45:41] [7] eval-logloss:0.01018 train-logloss:0.01331
|
||||
[10:45:41] [8] eval-logloss:0.00847 train-logloss:0.01112
|
||||
[10:45:41] [9] eval-logloss:0.00691 train-logloss:0.00662
|
||||
[10:45:41] [10] eval-logloss:0.00543 train-logloss:0.00503
|
||||
[10:45:41] [11] eval-logloss:0.00445 train-logloss:0.00420
|
||||
[10:45:41] [12] eval-logloss:0.00336 train-logloss:0.00355
|
||||
[10:45:41] [13] eval-logloss:0.00277 train-logloss:0.00280
|
||||
[10:45:41] [14] eval-logloss:0.00252 train-logloss:0.00244
|
||||
[10:45:41] [15] eval-logloss:0.00177 train-logloss:0.00193
|
||||
[10:45:41] [16] eval-logloss:0.00156 train-logloss:0.00161
|
||||
[10:45:41] [17] eval-logloss:0.00135 train-logloss:0.00142
|
||||
[10:45:41] [18] eval-logloss:0.00123 train-logloss:0.00125
|
||||
[10:45:41] [19] eval-logloss:0.00106 train-logloss:0.00107
|
||||
```
|
||||
|
||||
Once the training finishes, the model file should be written into
|
||||
`/tmp/nvflare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
|
||||
respectively.
|
||||
`/tmp/nvflare/poc/site-1/${job_id}/test.model.json` and `/tmp/nvflare/poc/site-2/${job_id}/test.model.json`
|
||||
respectively, where `job_id` is the UUID printed out when we ran `submit_job`.
|
||||
|
||||
Finally, shut down everything from the admin CLI, using `admin` as the password:
|
||||
```shell
|
||||
|
||||
@ -63,8 +63,8 @@ class XGBoostTrainer(Executor):
|
||||
}
|
||||
with xgb.collective.CommunicatorContext(**communicator_env):
|
||||
# Load file, file will not be sharded in federated mode.
|
||||
dtrain = xgb.DMatrix('agaricus.txt.train')
|
||||
dtest = xgb.DMatrix('agaricus.txt.test')
|
||||
dtrain = xgb.DMatrix('agaricus.txt.train?format=libsvm')
|
||||
dtest = xgb.DMatrix('agaricus.txt.test?format=libsvm')
|
||||
|
||||
# Specify parameters via a map; the definitions are the same as in the C++ version
|
||||
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
set -e
|
||||
|
||||
rm -fr ./agaricus* ./*.pem ./poc
|
||||
rm -fr ./agaricus* ./*.pem /tmp/nvflare
|
||||
|
||||
world_size=2
|
||||
|
||||
@ -11,15 +11,15 @@ openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out se
|
||||
openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
|
||||
|
||||
# Split train and test files manually to simulate a federated environment.
|
||||
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.train agaricus.txt.train-site-
|
||||
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.test agaricus.txt.test-site-
|
||||
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.train agaricus.txt.train-site-
|
||||
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.test agaricus.txt.test-site-
|
||||
|
||||
nvflare poc -n 2 --prepare
|
||||
mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
|
||||
cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
|
||||
cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
|
||||
for id in $(eval echo "{1..$world_size}"); do
|
||||
cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$id"/
|
||||
cp agaricus.txt.train-site-"$id" /tmp/nvflare/poc/site-"$id"/agaricus.txt.train
|
||||
cp agaricus.txt.test-site-"$id" /tmp/nvflare/poc/site-"$id"/agaricus.txt.test
|
||||
for (( site=1; site<=world_size; site++ )); do
|
||||
cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$site"/
|
||||
cp agaricus.txt.train-site-"$site" /tmp/nvflare/poc/site-"$site"/agaricus.txt.train
|
||||
cp agaricus.txt.test-site-"$site" /tmp/nvflare/poc/site-"$site"/agaricus.txt.test
|
||||
done
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user