check in the recover strategy

2014-11-30 11:42:59 -08:00 · 2014-11-30 11:42:59 -08:00 · 2e536eda29
commit 2e536eda29
parent 155ed3a814
6 changed files with 235 additions and 31 deletions
--- a/src/allreduce.h
+++ b/src/allreduce.h
@ -2,7 +2,9 @@
 #define ALLREDUCE_H
 /*!
 * \file allreduce.h
- * \brief This file defines a template wrapper of engine to ensure 
+ * \brief This file defines a template wrapper of engine to give more flexible
 *      AllReduce operations
 *
 * \author Tianqi Chen, Nacho, Tianyi
 */
 #include "./engine.h"
--- a/src/engine.h
+++ b/src/engine.h
@ -1,6 +1,6 @@
 /*!
 * \file engine.h
- * \brief This file defines the interface of allreduce library
+ * \brief This file defines the core interface of allreduce library
 * \author Tianqi Chen, Nacho, Tianyi
 */
 #ifndef ALLREDUCE_ENGINE_H
--- a/src/engine_base.cc
+++ b/src/engine_base.cc
@ -1,3 +1,8 @@
 /*!
 * \file engine_base.cc
 * \brief Basic implementation of AllReduce
 * \author Tianqi, Nacho, Tianyi
 */
 #define _CRT_SECURE_NO_WARNINGS
 #define _CRT_SECURE_NO_DEPRECATE
 #define NOMINMAX
@ -137,10 +142,8 @@ void AllReduceBase::SetParam(const char *name, const char *val) {
 * \param type_nbytes the unit number of bytes the type have
 * \param count number of elements to be reduced
 * \param reducer reduce function
- * \return this function can return
+ * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
- *         - kSuccess: allreduce is success,
+ * \sa ReturnType
 *         - kSockError: a neighbor node go down, the connection is dropped
 *         - kGetExcept: another node which is not my neighbor go down, get Out-of-Band exception notification from my neighbor
 */
 AllReduceBase::ReturnType
 AllReduceBase::TryAllReduce(void *sendrecvbuf_,
@ -278,7 +281,8 @@ AllReduceBase::TryAllReduce(void *sendrecvbuf_,
 * \param sendrecvbuf_ buffer for both sending and recving data
 * \param total_size the size of the data to be broadcasted
 * \param root the root worker id to broadcast the data
- * \return this function can return three possible values, see detail in TryAllReduce
+ * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
 * \sa ReturnType
 */
 AllReduceBase::ReturnType
 AllReduceBase::TryBroadcast(void *sendrecvbuf_, size_t total_size, int root) {
--- a/src/engine_base.h
+++ b/src/engine_base.h
@ -97,8 +97,14 @@ class AllReduceBase : public IEngine {
 protected:
  /*! \brief enumeration of possible returning results from Try functions */
  enum ReturnType {
    /*! \brief execution is successful */
    kSuccess,
    /*! \brief a neighbor node go down, the connection is dropped */
    kSockError,
    /*! 
     * \brief another node which is not my neighbor go down,
     *   get Out-of-Band exception notification from my neighbor
     */
    kGetExcept
  };
  // link record to a neighbor
@ -202,10 +208,8 @@ class AllReduceBase : public IEngine {
   * \param type_nbytes the unit number of bytes the type have
   * \param count number of elements to be reduced
   * \param reducer reduce function
-   * \return this function can return
+   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
-   *         - kSuccess: allreduce is success,
+   * \sa ReturnType
   *         - kSockError: a neighbor node go down, the connection is dropped
   *         - kGetExcept: another node which is not my neighbor go down, get Out-of-Band exception notification from my neighbor
   */
  ReturnType TryAllReduce(void *sendrecvbuf_,
                          size_t type_nbytes,
@ -216,7 +220,8 @@ class AllReduceBase : public IEngine {
   * \param sendrecvbuf_ buffer for both sending and recving data
   * \param size the size of the data to be broadcasted
   * \param root the root worker id to broadcast the data
-   * \return this function can return three possible values, see detail in TryAllReduce
+   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
   * \sa ReturnType
   */
  ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root);
  //---- local data related to link ----
--- a/src/engine_robust.cc
+++ b/src/engine_robust.cc
@ -1,3 +1,8 @@
 /*!
 * \file engine_robust.cc
 * \brief Robust implementation of AllReduce
 * \author Tianqi, Nacho, Tianyi
 */
 #define _CRT_SECURE_NO_WARNINGS
 #define _CRT_SECURE_NO_DEPRECATE
 #define NOMINMAX
@ -72,7 +77,6 @@ AllReduceRobust::ReturnType AllReduceRobust::TryResetLinks(void) {
    links[i].InitBuffer(sizeof(int), 1 << 10, reduce_buffer_size);
    links[i].ResetSize();
  }  
  // read and discard data from all channels until pass mark
  while (true) {
    for (int i = 0; i < nlink; ++i) {
@ -179,12 +183,150 @@ AllReduceRobust::ReturnType AllReduceRobust::TryResetLinks(void) {
  }
  return kSuccess;
 }
-
+/*!
-bool AllReduceRobust::RecoverExec(void *sendrecvbuf_, size_t size, int flag, int seqno) {
+ * \brief try to reconnect the broken links
 * \return this function can kSuccess or kSockError
 */
 AllReduceRobust::ReturnType AllReduceRobust::TryReConnectLinks(void) {
  utils::Error("TryReConnectLinks: not implemented");
  return kSuccess;
 }
 /*!
 * \brief if err_type indicates an error
 *         recover links according to the error type reported
 *        if there is no error, return true
 * \param err_type the type of error happening in the system
 * \return true if err_type is kSuccess, false otherwise 
 */
 bool AllReduceRobust::CheckAndRecover(ReturnType err_type) {
  if (err_type == kSuccess) return true;
  while(err_type != kSuccess) {
    switch(err_type) {
      case kGetExcept: err_type = TryResetLinks(); break;
      case kSockError: {
        TryResetLinks();
        err_type = TryReConnectLinks();
        break;
      }
      default: utils::Assert(false, "RecoverLinks: cannot reach here");
    }
  }
  return false;
 }
 /*!
 * \brief try to load check point
 *        
 *        This is a collaborative function called by all nodes
 *        only the nodes with requester set to true really needs to load the check point
 *        other nodes acts as collaborative roles to complete this request
 *
 * \param requester whether current node is the requester
 * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
 * \sa ReturnType
 */
 AllReduceRobust::ReturnType AllReduceRobust::TryLoadCheckPoint(bool requester) {
  utils::Error("TryLoadCheckPoint: not implemented");
  return kSuccess;
 }
 /*!
 * \brief try to get the result of operation specified by seqno
 *
 *        This is a collaborative function called by all nodes
 *        only the nodes with requester set to true really needs to get the result
 *        other nodes acts as collaborative roles to complete this request
 *
 * \param buf the buffer to store the result, this parameter is only use when current node is requester
 * \param size the total size of the buffer, this parameter is only use when current node is requester
 * \param seqno sequence number of the operation, this is unique index of a operation in current iteration
 * \param requester whether current node is the requester
 * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
 * \sa ReturnType
 */
 AllReduceRobust::ReturnType AllReduceRobust::TryGetResult(void *sendrecvbuf, size_t size, int seqno, bool requester) {
  utils::Error("TryGetResult: not implemented");
  return kSuccess;
 }
 /*!
 * \brief try to run recover execution for a request action described by flag and seqno,
 *        the function will keep blocking to run possible recovery operations before the specified action,
 *        until the requested result is received by a recovering procedure,
 *        or the function discovers that the requested action is not yet executed, and return false
 *
 * \param buf the buffer to store the result
 * \param size the total size of the buffer
 * \param flag flag information about the action \sa ActionSummary
 * \param seqno sequence number of the action, if it is special action with flag set, seqno needs to be set to ActionSummary::kMaxSeq
 *
 * \return if this function can return true or false 
 *    - true means buf already set to the
 *           result by recovering procedure, the action is complete, no further action is needed
 *    - false means this is the lastest action that has not yet been executed, need to execute the action
 */
 bool AllReduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) {
  if (flag != 0) {
    utils::Assert(seqno == ActionSummary::kMaxSeq, "must only set seqno for normal operations");
  }
-  ActionSummary act(flag, seqno);
+  // request
  ActionSummary req(flag, seqno);  
  while (true) {
    // action
    ActionSummary act = req;    
    // get the reduced action
    if (!CheckAndRecover(TryAllReduce(&act, sizeof(act), 1, ActionSummary::Reducer))) continue;
    if (act.check_ack()) {
      if (act.check_point()) {
        // if we also have check_point, do check point first
        utils::Assert(!act.diff_seq(),
                      "check ack & check pt  cannot occur together with normal ops");
        // if we requested checkpoint, we are free to go
        if (req.check_point()) return true;
      } else if (act.load_check()) {
        // if there is only check_ack and load_check, do load_check
        if (!CheckAndRecover(TryLoadCheckPoint(req.load_check()))) continue;
        // if requested load check, then misson complete
        if (req.load_check()) return true;
      } else {
        // there is no check point and no load check, execute check ack
        if (req.check_ack()) return true;
      }
      // if execute to this point
      // this means the action requested has not been completed
      // try next round
    } else {
      if (act.check_point()) {
        if (act.diff_seq()) {
          utils::Assert(act.min_seqno() != ActionSummary::kMaxSeq, "min seq bug");
          bool requester = req.min_seqno() == act.min_seqno();
          if (!CheckAndRecover(TryGetResult(buf, size, act.min_seqno(), requester))) continue;
          if (requester) return true;
        } else  {
          // no difference in seq no, means we are free to check point
          if (req.check_point()) return true;
        }
      } else {
        // no check point
        if (act.load_check()) {
          // load check have higher priority, do load_check
          if (!CheckAndRecover(TryLoadCheckPoint(req.load_check()))) continue;
          // if requested load check, then misson complete
          if (req.load_check()) return true;          
        } else {
          // no special flags, no checkpoint, check ack, load_check
          utils::Assert(act.min_seqno() != ActionSummary::kMaxSeq, "min seq bug");
          if (act.diff_seq()) {
            bool requester = req.min_seqno() == act.min_seqno();
            if (!CheckAndRecover(TryGetResult(buf, size, act.min_seqno(), requester))) continue;
            if (requester) return true;
          } else {
            // all the request is same, this is most recent command that is yet to be executed
            return false;
          }
        }
      }
      // something is still incomplete try next round
    }
  }
  utils::Assert(false, "RecoverExec: should not reach here");
  return true;
 }
 }  // namespace engine
--- a/src/engine_robust.h
+++ b/src/engine_robust.h
@ -89,15 +89,19 @@ class AllReduceRobust : public AllReduceBase {
    inline int min_seqno(void) const {
      return seqcode >> 4;
    }
    // whether the operation set contains a load_check
    inline bool load_check(void) const {
      return (seqcode & kLoadCheck) != 0;
    }
    // whether the operation set contains a check point
    inline bool check_point(void) const {
      return (seqcode & kCheckPoint) != 0;
    }
-    // whether the operation set contains a check point
+    // whether the operation set contains a check ack
    inline bool check_ack(void) const {
      return (seqcode & kCheckAck) != 0;
    }
-    // whether the operation set contains a check point
+    // whether the operation set contains different sequence number
    inline bool diff_seq(void) const {
      return (seqcode & kDiffSeq) != 0;
    }
@ -186,15 +190,62 @@ class AllReduceRobust : public AllReduceBase {
   */
  ReturnType TryResetLinks(void);  
  /*!
-   * \brief Run recovery execution of a action specified by flag and seqno,
+   * \brief try to reconnect the broken links
-   *        there can be two outcome of the function
+   * \return this function can kSuccess or kSockError
   *       
   * \param sendrecvbuf_
   *
   * \return if this function returns true, this means
   *        behind and we will be able to recover data from existing node 
   */
-  bool RecoverExec(void *sendrecvbuf_, size_t size, int flag, int seqno);
+  ReturnType TryReConnectLinks(void);
  /*!
   * \brief if err_type indicates an error
   *         recover links according to the error type reported
   *        if there is no error, return true
   * \param err_type the type of error happening in the system
   * \return true if err_type is kSuccess, false otherwise 
   */
  bool CheckAndRecover(ReturnType err_type);
  /*!
   * \brief try to run recover execution for a request action described by flag and seqno,
   *        the function will keep blocking to run possible recovery operations before the specified action,
   *        until the requested result is received by a recovering procedure,
   *        or the function discovers that the requested action is not yet executed, and return false
   *
   * \param buf the buffer to store the result
   * \param size the total size of the buffer
   * \param flag flag information about the action \sa ActionSummary
   * \param seqno sequence number of the action, if it is special action with flag set, seqno needs to be set to ActionSummary::kMaxSeq
   *
   * \return if this function can return true or false 
 *    - true means buf already set to the
 *           result by recovering procedure, the action is complete, no further action is needed
   *    - false means this is the lastest action that has not yet been executed, need to execute the action
   */
  bool RecoverExec(void *buf, size_t size, int flag, int seqno = ActionSummary::kMaxSeq);  
  /*!
   * \brief try to load check point
   *        
   *        This is a collaborative function called by all nodes
   *        only the nodes with requester set to true really needs to load the check point
   *        other nodes acts as collaborative roles to complete this request
   *
   * \param requester whether current node is the requester
   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
   * \sa ReturnType
   */
  ReturnType TryLoadCheckPoint(bool requester);
  /*!
   * \brief try to get the result of operation specified by seqno
   *
   *        This is a collaborative function called by all nodes
   *        only the nodes with requester set to true really needs to get the result
   *        other nodes acts as collaborative roles to complete this request
   *
   * \param buf the buffer to store the result, this parameter is only use when current node is requester
   * \param size the total size of the buffer, this parameter is only use when current node is requester
   * \param seqno sequence number of the operation, this is unique index of a operation in current iteration
   * \param requester whether current node is the requester
   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
   * \sa ReturnType
   */
  ReturnType TryGetResult(void *buf, size_t size, int seqno, bool requester);
  //---- recovery data structure ----
  // call sequence counter, records how many calls we made so far
  // from last call to CheckPoint, LoadCheckPoint