From 7765e2dc553a2432244272b2bc125769218d92dc Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 5 Dec 2014 09:49:26 -0800 Subject: [PATCH] add status report --- src/allreduce_base.h | 14 ++++++++++++++ src/allreduce_robust.cc | 1 + src/allreduce_robust.h | 3 --- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/allreduce_base.h b/src/allreduce_base.h index 8ebd3ad3a..e972551f8 100644 --- a/src/allreduce_base.h +++ b/src/allreduce_base.h @@ -144,6 +144,16 @@ class AllreduceBase : public IEngine { virtual void InitAfterException(void) { utils::Error("InitAfterException: not implemented"); } + /*! + * \brief report current status to the job tracker: when hadoop_mode is nonzero, + * print a reporter:status line with version_number and seq_counter to stderr; otherwise do nothing + */ + inline void ReportStatus(void) const { + if (hadoop_mode != 0) { + fprintf(stderr, "reporter:status:Rabit Phase[%03d] Operation %03d\n", + version_number, seq_counter); + } + } protected: /*! \brief enumeration of possible returning results from Try functions */ @@ -284,6 +294,10 @@ class AllreduceBase : public IEngine { */ ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root); //---- data structure related to model ---- + // call sequence counter, records how many calls we made so far + // from last call to CheckPoint, LoadCheckPoint + int seq_counter; + // version number of model int version_number; // whether the job is running in hadoop int hadoop_mode; diff --git a/src/allreduce_robust.cc b/src/allreduce_robust.cc index 3232ef40c..0f30ae1aa 100644 --- a/src/allreduce_robust.cc +++ b/src/allreduce_robust.cc @@ -645,6 +645,7 @@ bool AllreduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) { // request ActionSummary req(flag, seqno); while (true) { + this->ReportStatus(); // action ActionSummary act = req; // get the reduced action diff --git a/src/allreduce_robust.h b/src/allreduce_robust.h index 45820a017..2f183ef13 100644 --- a/src/allreduce_robust.h +++ b/src/allreduce_robust.h @@ -402,9 +402,6 @@ class AllreduceRobust : 
public AllreduceBase { size_t out_index) ); //---- recovery data structure ---- - // call sequence counter, records how many calls we made so far - // from last call to CheckPoint, LoadCheckPoint - int seq_counter; // the round of result buffer, used to mode the result int result_buffer_round; // result buffer of all reduce