diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
index 9a9b8dd24..cf6622b53 100644
--- a/multi-node/col-split/README.md
+++ b/multi-node/col-split/README.md
@@ -1,6 +1,7 @@
 Distributed XGBoost: Column Split Version
 ====
 * run ```bash mushroom-col.sh ```
+  - mushroom-col.sh starts xgboost-mpi job
 * run ```bash mushroom-col-tcp.sh ```
   - mushroom-col-tcp.sh starts xgboost job using xgboost's buildin allreduce
 * run ```bash mushroom-col-python.sh ```
diff --git a/multi-node/col-split/mushroom-col-tcp.sh b/multi-node/col-split/mushroom-col-tcp.sh
new file mode 100755
index 000000000..7257f9890
--- /dev/null
+++ b/multi-node/col-split/mushroom-col-tcp.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: $0 <nprocess>"
+    exit 1
+fi
+
+#
+# This script is the same as mushroom-col.sh except that it uses xgboost instead of xgboost-mpi.
+# xgboost uses a built-in tcp-based allreduce module, and can be run on more environments, so long as we know how to start the job by modifying ../submit_job_tcp.py
+#
+rm -rf train.col* *.model
+k=$1
+
+# split the lib svm file into k subfiles
+python splitsvm.py ../../demo/data/agaricus.txt.train train "$k"
+
+# run xgboost with the built-in tcp-based allreduce (no MPI needed)
+../submit_job_tcp.py "$k" ../../xgboost mushroom-col.conf dsplit=col
+
+# the model can be directly loaded by the single machine xgboost solver, as usual
+../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice."$k".txt
+
+# run for one round, and continue training
+../submit_job_tcp.py "$k" ../../xgboost mushroom-col.conf dsplit=col num_round=1
+../submit_job_tcp.py "$k" ../../xgboost mushroom-col.conf dsplit=col model_in=0001.model
+
+cat dump.nice."$k".txt
diff --git a/multi-node/row-split/README.md b/multi-node/row-split/README.md
index 4c427f3ec..807b0608d 100644
--- a/multi-node/row-split/README.md
+++ b/multi-node/row-split/README.md
@@ -3,6 +3,8 @@ Distributed XGBoost: Row Split Version
 * Mushroom: run ```bash mushroom-row.sh ```
 * Machine: run ```bash machine-row.sh ```
   - Machine case also include example to continue training from existing model
+* Machine TCP: run ```bash machine-row-tcp.sh ```
+  - machine-row-tcp.sh starts xgboost job using xgboost's built-in allreduce
 
 How to Use
 ====
diff --git a/multi-node/row-split/machine-row-tcp.sh b/multi-node/row-split/machine-row-tcp.sh
new file mode 100755
index 000000000..c312eb3a5
--- /dev/null
+++ b/multi-node/row-split/machine-row-tcp.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: $0 <nprocess>"
+    exit 1
+fi
+
+rm -rf train-machine.row* *.model
+k=$1
+# make machine data
+cd ../../demo/regression/
+python mapfeat.py
+python mknfold.py machine.txt 1
+cd -
+
+# split the lib svm file into k subfiles
+python splitrows.py ../../demo/regression/machine.txt.train train-machine "$k"
+
+# run xgboost with the built-in tcp-based allreduce (no MPI needed)
+../submit_job_tcp.py "$k" ../../xgboost machine-row.conf dsplit=row num_round=3
+
+# run one round to save model 0001, then continue training from the existing model
+../submit_job_tcp.py "$k" ../../xgboost machine-row.conf dsplit=row num_round=1
+../submit_job_tcp.py "$k" ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model