recover not yet working
This commit is contained in:
10
src/engine.h
10
src/engine.h
@@ -52,7 +52,13 @@ class IEngine {
|
||||
* \param root the root worker id to broadcast the data
|
||||
*/
|
||||
virtual void Broadcast(void *sendrecvbuf_, size_t size, int root) = 0;
|
||||
/*!
|
||||
/*!
|
||||
* \brief explicitly re-init everything before calling LoadCheckPoint
|
||||
* call this function when IEngine throws an exception,
|
||||
* this function is only used for test purpose
|
||||
*/
|
||||
virtual void InitAfterException(void) = 0;
|
||||
/*!
|
||||
* \brief load latest check point
|
||||
* \param p_model pointer to the model
|
||||
* \return true if there was stored checkpoint and load was successful
|
||||
@@ -63,7 +69,7 @@ class IEngine {
|
||||
* \brief checkpoint the model, meaning we finished a stage of execution
|
||||
* \param model the model to be checkpointed (passed by const reference)
|
||||
*/
|
||||
virtual void CheckPoint(const utils::ISerializable &model) = 0;
|
||||
virtual void CheckPoint(const utils::ISerializable &model) = 0;
|
||||
/*! \brief get rank of current node */
|
||||
virtual int GetRank(void) const = 0;
|
||||
/*! \brief get total number of */
|
||||
|
||||
@@ -93,7 +93,15 @@ class AllReduceBase : public IEngine {
|
||||
*/
|
||||
virtual void CheckPoint(const utils::ISerializable &model) {
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief explicitly re-init everything before calling LoadCheckPoint
|
||||
* call this function when IEngine throws an exception,
|
||||
* this function is only used for test purpose
|
||||
*/
|
||||
virtual void InitAfterException(void) {
|
||||
// not implemented in this class: report that through utils::Error
utils::Error("InitAfterException: not implemented");
|
||||
}
|
||||
|
||||
protected:
|
||||
/*! \brief enumeration of possible returning results from Try functions */
|
||||
enum ReturnType {
|
||||
|
||||
@@ -51,6 +51,14 @@ class AllReduceRobust : public AllReduceBase {
|
||||
* \param model the model to be checkpointed (passed by const reference)
|
||||
*/
|
||||
virtual void CheckPoint(const utils::ISerializable &model);
|
||||
/*!
|
||||
* \brief explicitly re-init everything before calling LoadCheckPoint
|
||||
* call this function when IEngine throws an exception,
|
||||
* this function is only used for test purpose
|
||||
*/
|
||||
virtual void InitAfterException(void) {
|
||||
// delegate to CheckAndRecover, passing kGetExcept
// NOTE(review): presumably kGetExcept signals that an exception was caught — confirm against its definition
this->CheckAndRecover(kGetExcept);
|
||||
}
|
||||
|
||||
private:
|
||||
// constant one byte out of band message to indicate error happening
|
||||
|
||||
Reference in New Issue
Block a user