diff --git a/multi-node/row-split/machine-row-rabit-mock.sh b/multi-node/row-split/machine-row-rabit-mock.sh index f61ef2152..b08e7d4e6 100755 --- a/multi-node/row-split/machine-row-rabit-mock.sh +++ b/multi-node/row-split/machine-row-rabit-mock.sh @@ -17,4 +17,4 @@ cd - python splitrows.py ../../demo/regression/machine.txt.train train-machine $k # run xgboost mpi -../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost-mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 +../../rabit/tracker/rabit_mpi.py -n $k ../../rabit/test/keepalive.sh ../../xgboost-mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 diff --git a/multi-node/row-split/machine-row-rabit.sh b/multi-node/row-split/machine-row-rabit.sh index 4a526ff94..69f94b9d1 100755 --- a/multi-node/row-split/machine-row-rabit.sh +++ b/multi-node/row-split/machine-row-rabit.sh @@ -17,8 +17,8 @@ cd - python splitrows.py ../../demo/regression/machine.txt.train train-machine $k # run xgboost mpi -../../rabit/tracker/rabit_mpi.py $k local ../../xgboost machine-row.conf dsplit=row num_round=3 +../../rabit/tracker/rabit_mpi.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=3 eval_train=1 # run xgboost-mpi save model 0001, continue to run from existing model -../../rabit/tracker/rabit_mpi.py $k local ../../xgboost machine-row.conf dsplit=row num_round=1 -../../rabit/tracker/rabit_mpi.py $k local ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model +../../rabit/tracker/rabit_mpi.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=1 +../../rabit/tracker/rabit_mpi.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp index fb0b8953d..7334ad689 100644 --- a/src/learner/evaluation-inl.hpp +++ b/src/learner/evaluation-inl.hpp @@ -11,6 +11,8 @@ #include <vector> #include <cmath> #include <cstring> +// rabit library for synchronization +#include <rabit.h> #include 
"./evaluation.h" #include "./helper_utils.h" @@ -23,7 +25,8 @@ namespace learner { template struct EvalEWiseBase : public IEvaluator { virtual float Eval(const std::vector &preds, - const MetaInfo &info) const { + const MetaInfo &info, + bool distributed) const { utils::Check(info.labels.size() != 0, "label set cannot be empty"); utils::Check(preds.size() % info.labels.size() == 0, "label and prediction size not match"); @@ -37,7 +40,11 @@ struct EvalEWiseBase : public IEvaluator { sum += Derived::EvalRow(info.labels[i], preds[i]) * wt; wsum += wt; } - return Derived::GetFinal(sum, wsum); + float dat[2]; dat[0] = sum, dat[1] = wsum; + if (distributed) { + rabit::Allreduce(dat, 2); + } + return Derived::GetFinal(dat[0], dat[1]); } /*! * \brief to be implemented by subclass, @@ -113,7 +120,9 @@ struct EvalCTest: public IEvaluator { return name_.c_str(); } virtual float Eval(const std::vector &preds, - const MetaInfo &info) const { + const MetaInfo &info, + bool distributed) const { + utils::Check(!distributed, "metric %s do not support distributed evaluation", name_.c_str()); utils::Check(preds.size() % info.labels.size() == 0, "label and prediction size not match"); size_t ngroup = preds.size() / info.labels.size() - 1; @@ -150,7 +159,9 @@ struct EvalAMS : public IEvaluator { utils::Check(std::sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format"); } virtual float Eval(const std::vector &preds, - const MetaInfo &info) const { + const MetaInfo &info, + bool distributed) const { + utils::Check(!distributed, "metric AMS do not support distributed evaluation"); using namespace std; const bst_omp_uint ndata = static_cast(info.labels.size()); @@ -212,7 +223,9 @@ struct EvalPrecisionRatio : public IEvaluator{ } } virtual float Eval(const std::vector &preds, - const MetaInfo &info) const { + const MetaInfo &info, + bool distributed) const { + utils::Check(!distributed, "metric %s do not support distributed evaluation", Name()); utils::Check(info.labels.size() != 0, 
"label set cannot be empty"); utils::Assert(preds.size() % info.labels.size() == 0, "label size predict size not match"); @@ -252,7 +265,8 @@ struct EvalPrecisionRatio : public IEvaluator{ /*! \brief Area under curve, for both classification and rank */ struct EvalAuc : public IEvaluator { virtual float Eval(const std::vector &preds, - const MetaInfo &info) const { + const MetaInfo &info, + bool distributed) const { utils::Check(info.labels.size() != 0, "label set cannot be empty"); utils::Check(preds.size() % info.labels.size() == 0, "label size predict size not match"); @@ -299,8 +313,14 @@ struct EvalAuc : public IEvaluator { sum_auc += sum_pospair / (sum_npos*sum_nneg); } } - // return average AUC over list - return static_cast(sum_auc) / ngroup; + if (distributed) { + float dat[2]; dat[0] = sum_auc; dat[1] = ngroup; + // approximately estimate auc using mean + rabit::Allreduce(dat, 2); + return dat[0] / dat[1]; + } else { + return static_cast(sum_auc) / ngroup; + } } virtual const char *Name(void) const { return "auc"; @@ -311,7 +331,8 @@ struct EvalAuc : public IEvaluator { struct EvalRankList : public IEvaluator { public: virtual float Eval(const std::vector &preds, - const MetaInfo &info) const { + const MetaInfo &info, + bool distributed) const { utils::Check(preds.size() == info.labels.size(), "label size predict size not match"); // quick consistency when group is not available @@ -336,7 +357,14 @@ struct EvalRankList : public IEvaluator { sum_metric += this->EvalMetric(rec); } } - return static_cast(sum_metric) / ngroup; + if (distributed) { + float dat[2]; dat[0] = sum_metric; dat[1] = ngroup; + // approximately estimate auc using mean + rabit::Allreduce(dat, 2); + return dat[0] / dat[1]; + } else { + return static_cast(sum_metric) / ngroup; + } } virtual const char *Name(void) const { return name_.c_str(); diff --git a/src/learner/evaluation.h b/src/learner/evaluation.h index 33370e706..4d59e270a 100644 --- a/src/learner/evaluation.h +++ 
b/src/learner/evaluation.h @@ -19,9 +19,13 @@ struct IEvaluator{ * \brief evaluate a specific metric * \param preds prediction * \param info information, including label etc. + * \param distributed whether a call to Allreduce is needed to gather + * the average statistics across all the nodes, + * this is only supported by some metrics */ virtual float Eval(const std::vector<float> &preds, - const MetaInfo &info) const = 0; + const MetaInfo &info, + bool distributed = false) const = 0; /*! \return name of metric */ virtual const char *Name(void) const = 0; /*! \brief virtual destructor */ @@ -70,10 +74,11 @@ class EvalSet{ } inline std::string Eval(const char *evname, const std::vector<float> &preds, - const MetaInfo &info) const { + const MetaInfo &info, + bool distributed = false) { std::string result = ""; for (size_t i = 0; i < evals_.size(); ++i) { - float res = evals_[i]->Eval(preds, info); + float res = evals_[i]->Eval(preds, info, distributed); char tmp[1024]; utils::SPrintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res); result += tmp; diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index cb02ee075..ae0967ce8 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -287,7 +287,7 @@ class BoostLearner : public rabit::ISerializable { for (size_t i = 0; i < evals.size(); ++i) { this->PredictRaw(*evals[i], &preds_); obj_->EvalTransform(&preds_); - res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info); + res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info, distributed_mode == 2); } return res; } diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp index 7816fbfd2..52ce4e58b 100644 --- a/src/xgboost_main.cpp +++ b/src/xgboost_main.cpp @@ -181,8 +181,14 @@ class BoostLearnTask { if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed); learner.UpdateOneIter(i, *data); std::string res = learner.EvalOneIter(i, devalall, eval_data_names); - if (silent < 2) { - fprintf(stderr, 
"%s\n", res.c_str()); + if (rabit::IsDistributed()){ + if (rabit::GetRank() == 0) { + rabit::TrackerPrintf("%s\n", res.c_str()); + } + } else { + if (silent < 2) { + fprintf(stderr, "%s\n", res.c_str()); + } } if (save_period != 0 && (i + 1) % save_period == 0) { this->SaveModel(i);