add status report
This commit is contained in:
parent
ab278513ab
commit
7765e2dc55
@ -144,6 +144,16 @@ class AllreduceBase : public IEngine {
|
|||||||
virtual void InitAfterException(void) {
|
virtual void InitAfterException(void) {
|
||||||
utils::Error("InitAfterException: not implemented");
|
utils::Error("InitAfterException: not implemented");
|
||||||
}
|
}
|
||||||
|
/*!
|
||||||
|
* \brief report current progress (version_number and seq_counter) to the job tracker;
|
||||||
|
* only hadoop mode is handled: when hadoop_mode != 0 one "reporter:status:..." line is written to stderr, otherwise nothing is done
|
||||||
|
*/
|
||||||
|
inline void ReportStatus(void) const {
|
||||||
|
if (hadoop_mode != 0) {
|
||||||
|
fprintf(stderr, "reporter:status:Rabit Phase[%03d] Operation %03d\n",
|
||||||
|
version_number, seq_counter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
/*! \brief enumeration of possible returning results from Try functions */
|
/*! \brief enumeration of possible returning results from Try functions */
|
||||||
@ -284,6 +294,10 @@ class AllreduceBase : public IEngine {
|
|||||||
*/
|
*/
|
||||||
ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root);
|
ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root);
|
||||||
//---- data structure related to model ----
|
//---- data structure related to model ----
|
||||||
|
// call sequence counter, records how many calls we made so far
|
||||||
|
// from last call to CheckPoint, LoadCheckPoint
|
||||||
|
int seq_counter;
|
||||||
|
// version number of model
|
||||||
int version_number;
|
int version_number;
|
||||||
// whether the job is running in hadoop
|
// whether the job is running in hadoop
|
||||||
int hadoop_mode;
|
int hadoop_mode;
|
||||||
|
|||||||
@ -645,6 +645,7 @@ bool AllreduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) {
|
|||||||
// request
|
// request
|
||||||
ActionSummary req(flag, seqno);
|
ActionSummary req(flag, seqno);
|
||||||
while (true) {
|
while (true) {
|
||||||
|
this->ReportStatus();
|
||||||
// action
|
// action
|
||||||
ActionSummary act = req;
|
ActionSummary act = req;
|
||||||
// get the reduced action
|
// get the reduced action
|
||||||
|
|||||||
@ -402,9 +402,6 @@ class AllreduceRobust : public AllreduceBase {
|
|||||||
size_t out_index)
|
size_t out_index)
|
||||||
);
|
);
|
||||||
//---- recovery data structure ----
|
//---- recovery data structure ----
|
||||||
// call sequence counter, records how many calls we made so far
|
|
||||||
// from last call to CheckPoint, LoadCheckPoint
|
|
||||||
int seq_counter;
|
|
||||||
// the round of result buffer, used to mod the result
|
// the round of result buffer, used to mod the result
|
||||||
int result_buffer_round;
|
int result_buffer_round;
|
||||||
// result buffer of all reduce
|
// result buffer of all reduce
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user