recover not yet working

This commit is contained in:
tqchen
2014-12-01 16:57:26 -08:00
parent 1c5167d96e
commit 337840d29b
6 changed files with 154 additions and 5 deletions

View File

@@ -52,7 +52,13 @@ class IEngine {
* \param root the root worker id to broadcast the data
*/
virtual void Broadcast(void *sendrecvbuf_, size_t size, int root) = 0;
/*!
/*!
* \brief explicitly re-initialize everything before calling LoadCheckPoint
*
* Call this function after IEngine has thrown an exception;
* this function is only used for testing purposes.
*/
virtual void InitAfterException(void) = 0;
/*!
* \brief load latest check point
* \param p_model pointer to the model
* \return true if there was stored checkpoint and load was successful
@@ -63,7 +69,7 @@ class IEngine {
* \brief checkpoint the model, meaning we finished a stage of execution
* \param p_model pointer to the model
*/
virtual void CheckPoint(const utils::ISerializable &model) = 0;
virtual void CheckPoint(const utils::ISerializable &model) = 0;
/*!
* \brief get rank of current node
* \return the rank of the current node
*/
virtual int GetRank(void) const = 0;
/*! \brief get total number of */

View File

@@ -93,7 +93,15 @@ class AllReduceBase : public IEngine {
*/
virtual void CheckPoint(const utils::ISerializable &model) {
// intentionally empty: this implementation does nothing with model
}
/*!
* \brief explicitly re-initialize everything before calling LoadCheckPoint
*
* Call this function after IEngine has thrown an exception;
* this function is only used for testing purposes.
*
* This implementation performs no re-initialization: it only reports
* "InitAfterException: not implemented" through utils::Error.
*/
virtual void InitAfterException(void) {
// NOTE(review): presumably utils::Error aborts or throws -- confirm against its definition
utils::Error("InitAfterException: not implemented");
}
protected:
/*! \brief enumeration of possible returning results from Try functions */
enum ReturnType {

View File

@@ -51,6 +51,14 @@ class AllReduceRobust : public AllReduceBase {
* \param p_model pointer to the model
*/
virtual void CheckPoint(const utils::ISerializable &model);
/*!
* \brief explicitly re-initialize everything before calling LoadCheckPoint
*
* Call this function after IEngine has thrown an exception;
* this function is only used for testing purposes.
*
* This implementation delegates to CheckAndRecover, passing kGetExcept.
*/
virtual void InitAfterException(void) {
// NOTE(review): kGetExcept presumably signals that an exception was received -- confirm where it is declared
this->CheckAndRecover(kGetExcept);
}
private:
// constant one byte out of band message to indicate error happening