check in the recover strategy
This commit is contained in:
parent
155ed3a814
commit
2e536eda29
@ -2,7 +2,9 @@
|
|||||||
#define ALLREDUCE_H
|
#define ALLREDUCE_H
|
||||||
/*!
|
/*!
|
||||||
* \file allreduce.h
|
* \file allreduce.h
|
||||||
* \brief This file defines a template wrapper of engine to ensure
|
* \brief This file defines a template wrapper of engine to give more flexible
|
||||||
|
* AllReduce operations
|
||||||
|
*
|
||||||
* \author Tianqi Chen, Nacho, Tianyi
|
* \author Tianqi Chen, Nacho, Tianyi
|
||||||
*/
|
*/
|
||||||
#include "./engine.h"
|
#include "./engine.h"
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
/*!
|
/*!
|
||||||
* \file engine.h
|
* \file engine.h
|
||||||
* \brief This file defines the interface of allreduce library
|
* \brief This file defines the core interface of allreduce library
|
||||||
* \author Tianqi Chen, Nacho, Tianyi
|
* \author Tianqi Chen, Nacho, Tianyi
|
||||||
*/
|
*/
|
||||||
#ifndef ALLREDUCE_ENGINE_H
|
#ifndef ALLREDUCE_ENGINE_H
|
||||||
|
|||||||
@ -1,3 +1,8 @@
|
|||||||
|
/*!
|
||||||
|
* \file engine_base.cc
|
||||||
|
* \brief Basic implementation of AllReduce
|
||||||
|
* \author Tianqi, Nacho, Tianyi
|
||||||
|
*/
|
||||||
#define _CRT_SECURE_NO_WARNINGS
|
#define _CRT_SECURE_NO_WARNINGS
|
||||||
#define _CRT_SECURE_NO_DEPRECATE
|
#define _CRT_SECURE_NO_DEPRECATE
|
||||||
#define NOMINMAX
|
#define NOMINMAX
|
||||||
@ -137,10 +142,8 @@ void AllReduceBase::SetParam(const char *name, const char *val) {
|
|||||||
* \param type_nbytes the unit number of bytes the type have
|
* \param type_nbytes the unit number of bytes the type have
|
||||||
* \param count number of elements to be reduced
|
* \param count number of elements to be reduced
|
||||||
* \param reducer reduce function
|
* \param reducer reduce function
|
||||||
* \return this function can return
|
* \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
|
||||||
* - kSuccess: allreduce is success,
|
* \sa ReturnType
|
||||||
* - kSockError: a neighbor node go down, the connection is dropped
|
|
||||||
* - kGetExcept: another node which is not my neighbor go down, get Out-of-Band exception notification from my neighbor
|
|
||||||
*/
|
*/
|
||||||
AllReduceBase::ReturnType
|
AllReduceBase::ReturnType
|
||||||
AllReduceBase::TryAllReduce(void *sendrecvbuf_,
|
AllReduceBase::TryAllReduce(void *sendrecvbuf_,
|
||||||
@ -278,7 +281,8 @@ AllReduceBase::TryAllReduce(void *sendrecvbuf_,
|
|||||||
* \param sendrecvbuf_ buffer for both sending and recving data
|
* \param sendrecvbuf_ buffer for both sending and recving data
|
||||||
* \param total_size the size of the data to be broadcasted
|
* \param total_size the size of the data to be broadcasted
|
||||||
* \param root the root worker id to broadcast the data
|
* \param root the root worker id to broadcast the data
|
||||||
* \return this function can return three possible values, see detail in TryAllReduce
|
* \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
|
||||||
|
* \sa ReturnType
|
||||||
*/
|
*/
|
||||||
AllReduceBase::ReturnType
|
AllReduceBase::ReturnType
|
||||||
AllReduceBase::TryBroadcast(void *sendrecvbuf_, size_t total_size, int root) {
|
AllReduceBase::TryBroadcast(void *sendrecvbuf_, size_t total_size, int root) {
|
||||||
|
|||||||
@ -97,8 +97,14 @@ class AllReduceBase : public IEngine {
|
|||||||
protected:
|
protected:
|
||||||
/*! \brief enumeration of possible returning results from Try functions */
|
/*! \brief enumeration of possible returning results from Try functions */
|
||||||
enum ReturnType {
|
enum ReturnType {
|
||||||
|
/*! \brief execution is successful */
|
||||||
kSuccess,
|
kSuccess,
|
||||||
|
/*! \brief a neighbor node go down, the connection is dropped */
|
||||||
kSockError,
|
kSockError,
|
||||||
|
/*!
|
||||||
|
* \brief another node which is not my neighbor go down,
|
||||||
|
* get Out-of-Band exception notification from my neighbor
|
||||||
|
*/
|
||||||
kGetExcept
|
kGetExcept
|
||||||
};
|
};
|
||||||
// link record to a neighbor
|
// link record to a neighbor
|
||||||
@ -202,10 +208,8 @@ class AllReduceBase : public IEngine {
|
|||||||
* \param type_nbytes the unit number of bytes the type have
|
* \param type_nbytes the unit number of bytes the type have
|
||||||
* \param count number of elements to be reduced
|
* \param count number of elements to be reduced
|
||||||
* \param reducer reduce function
|
* \param reducer reduce function
|
||||||
* \return this function can return
|
* \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
|
||||||
* - kSuccess: allreduce is success,
|
* \sa ReturnType
|
||||||
* - kSockError: a neighbor node go down, the connection is dropped
|
|
||||||
* - kGetExcept: another node which is not my neighbor go down, get Out-of-Band exception notification from my neighbor
|
|
||||||
*/
|
*/
|
||||||
ReturnType TryAllReduce(void *sendrecvbuf_,
|
ReturnType TryAllReduce(void *sendrecvbuf_,
|
||||||
size_t type_nbytes,
|
size_t type_nbytes,
|
||||||
@ -216,7 +220,8 @@ class AllReduceBase : public IEngine {
|
|||||||
* \param sendrecvbuf_ buffer for both sending and recving data
|
* \param sendrecvbuf_ buffer for both sending and recving data
|
||||||
* \param size the size of the data to be broadcasted
|
* \param size the size of the data to be broadcasted
|
||||||
* \param root the root worker id to broadcast the data
|
* \param root the root worker id to broadcast the data
|
||||||
* \return this function can return three possible values, see detail in TryAllReduce
|
* \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
|
||||||
|
* \sa ReturnType
|
||||||
*/
|
*/
|
||||||
ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root);
|
ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root);
|
||||||
//---- local data related to link ----
|
//---- local data related to link ----
|
||||||
|
|||||||
@ -1,3 +1,8 @@
|
|||||||
|
/*!
|
||||||
|
* \file engine_robust.cc
|
||||||
|
* \brief Robust implementation of AllReduce
|
||||||
|
* \author Tianqi, Nacho, Tianyi
|
||||||
|
*/
|
||||||
#define _CRT_SECURE_NO_WARNINGS
|
#define _CRT_SECURE_NO_WARNINGS
|
||||||
#define _CRT_SECURE_NO_DEPRECATE
|
#define _CRT_SECURE_NO_DEPRECATE
|
||||||
#define NOMINMAX
|
#define NOMINMAX
|
||||||
@ -72,7 +77,6 @@ AllReduceRobust::ReturnType AllReduceRobust::TryResetLinks(void) {
|
|||||||
links[i].InitBuffer(sizeof(int), 1 << 10, reduce_buffer_size);
|
links[i].InitBuffer(sizeof(int), 1 << 10, reduce_buffer_size);
|
||||||
links[i].ResetSize();
|
links[i].ResetSize();
|
||||||
}
|
}
|
||||||
|
|
||||||
// read and discard data from all channels until pass mark
|
// read and discard data from all channels until pass mark
|
||||||
while (true) {
|
while (true) {
|
||||||
for (int i = 0; i < nlink; ++i) {
|
for (int i = 0; i < nlink; ++i) {
|
||||||
@ -179,12 +183,150 @@ AllReduceRobust::ReturnType AllReduceRobust::TryResetLinks(void) {
|
|||||||
}
|
}
|
||||||
return kSuccess;
|
return kSuccess;
|
||||||
}
|
}
|
||||||
|
/*!
|
||||||
bool AllReduceRobust::RecoverExec(void *sendrecvbuf_, size_t size, int flag, int seqno) {
|
* \brief try to reconnect the broken links
|
||||||
|
* \return this function can kSuccess or kSockError
|
||||||
|
*/
|
||||||
|
AllReduceRobust::ReturnType AllReduceRobust::TryReConnectLinks(void) {
|
||||||
|
utils::Error("TryReConnectLinks: not implemented");
|
||||||
|
return kSuccess;
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief if err_type indicates an error
|
||||||
|
* recover links according to the error type reported
|
||||||
|
* if there is no error, return true
|
||||||
|
* \param err_type the type of error happening in the system
|
||||||
|
* \return true if err_type is kSuccess, false otherwise
|
||||||
|
*/
|
||||||
|
bool AllReduceRobust::CheckAndRecover(ReturnType err_type) {
|
||||||
|
if (err_type == kSuccess) return true;
|
||||||
|
while(err_type != kSuccess) {
|
||||||
|
switch(err_type) {
|
||||||
|
case kGetExcept: err_type = TryResetLinks(); break;
|
||||||
|
case kSockError: {
|
||||||
|
TryResetLinks();
|
||||||
|
err_type = TryReConnectLinks();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default: utils::Assert(false, "RecoverLinks: cannot reach here");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief try to load check point
|
||||||
|
*
|
||||||
|
* This is a collaborative function called by all nodes
|
||||||
|
* only the nodes with requester set to true really needs to load the check point
|
||||||
|
* other nodes acts as collaborative roles to complete this request
|
||||||
|
*
|
||||||
|
* \param requester whether current node is the requester
|
||||||
|
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
|
||||||
|
* \sa ReturnType
|
||||||
|
*/
|
||||||
|
AllReduceRobust::ReturnType AllReduceRobust::TryLoadCheckPoint(bool requester) {
|
||||||
|
utils::Error("TryLoadCheckPoint: not implemented");
|
||||||
|
return kSuccess;
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief try to get the result of operation specified by seqno
|
||||||
|
*
|
||||||
|
* This is a collaborative function called by all nodes
|
||||||
|
* only the nodes with requester set to true really needs to get the result
|
||||||
|
* other nodes acts as collaborative roles to complete this request
|
||||||
|
*
|
||||||
|
* \param buf the buffer to store the result, this parameter is only use when current node is requester
|
||||||
|
* \param size the total size of the buffer, this parameter is only use when current node is requester
|
||||||
|
* \param seqno sequence number of the operation, this is unique index of a operation in current iteration
|
||||||
|
* \param requester whether current node is the requester
|
||||||
|
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
|
||||||
|
* \sa ReturnType
|
||||||
|
*/
|
||||||
|
AllReduceRobust::ReturnType AllReduceRobust::TryGetResult(void *sendrecvbuf, size_t size, int seqno, bool requester) {
|
||||||
|
utils::Error("TryGetResult: not implemented");
|
||||||
|
return kSuccess;
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief try to run recover execution for a request action described by flag and seqno,
|
||||||
|
* the function will keep blocking to run possible recovery operations before the specified action,
|
||||||
|
* until the requested result is received by a recovering procedure,
|
||||||
|
* or the function discovers that the requested action is not yet executed, and return false
|
||||||
|
*
|
||||||
|
* \param buf the buffer to store the result
|
||||||
|
* \param size the total size of the buffer
|
||||||
|
* \param flag flag information about the action \sa ActionSummary
|
||||||
|
* \param seqno sequence number of the action, if it is special action with flag set, seqno needs to be set to ActionSummary::kMaxSeq
|
||||||
|
*
|
||||||
|
* \return if this function can return true or false
|
||||||
|
* - true means buf already set to the
|
||||||
|
* result by recovering procedure, the action is complete, no further action is needed
|
||||||
|
* - false means this is the lastest action that has not yet been executed, need to execute the action
|
||||||
|
*/
|
||||||
|
bool AllReduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) {
|
||||||
if (flag != 0) {
|
if (flag != 0) {
|
||||||
utils::Assert(seqno == ActionSummary::kMaxSeq, "must only set seqno for normal operations");
|
utils::Assert(seqno == ActionSummary::kMaxSeq, "must only set seqno for normal operations");
|
||||||
}
|
}
|
||||||
ActionSummary act(flag, seqno);
|
// request
|
||||||
|
ActionSummary req(flag, seqno);
|
||||||
|
while (true) {
|
||||||
|
// action
|
||||||
|
ActionSummary act = req;
|
||||||
|
// get the reduced action
|
||||||
|
if (!CheckAndRecover(TryAllReduce(&act, sizeof(act), 1, ActionSummary::Reducer))) continue;
|
||||||
|
if (act.check_ack()) {
|
||||||
|
if (act.check_point()) {
|
||||||
|
// if we also have check_point, do check point first
|
||||||
|
utils::Assert(!act.diff_seq(),
|
||||||
|
"check ack & check pt cannot occur together with normal ops");
|
||||||
|
// if we requested checkpoint, we are free to go
|
||||||
|
if (req.check_point()) return true;
|
||||||
|
} else if (act.load_check()) {
|
||||||
|
// if there is only check_ack and load_check, do load_check
|
||||||
|
if (!CheckAndRecover(TryLoadCheckPoint(req.load_check()))) continue;
|
||||||
|
// if requested load check, then misson complete
|
||||||
|
if (req.load_check()) return true;
|
||||||
|
} else {
|
||||||
|
// there is no check point and no load check, execute check ack
|
||||||
|
if (req.check_ack()) return true;
|
||||||
|
}
|
||||||
|
// if execute to this point
|
||||||
|
// this means the action requested has not been completed
|
||||||
|
// try next round
|
||||||
|
} else {
|
||||||
|
if (act.check_point()) {
|
||||||
|
if (act.diff_seq()) {
|
||||||
|
utils::Assert(act.min_seqno() != ActionSummary::kMaxSeq, "min seq bug");
|
||||||
|
bool requester = req.min_seqno() == act.min_seqno();
|
||||||
|
if (!CheckAndRecover(TryGetResult(buf, size, act.min_seqno(), requester))) continue;
|
||||||
|
if (requester) return true;
|
||||||
|
} else {
|
||||||
|
// no difference in seq no, means we are free to check point
|
||||||
|
if (req.check_point()) return true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// no check point
|
||||||
|
if (act.load_check()) {
|
||||||
|
// load check have higher priority, do load_check
|
||||||
|
if (!CheckAndRecover(TryLoadCheckPoint(req.load_check()))) continue;
|
||||||
|
// if requested load check, then misson complete
|
||||||
|
if (req.load_check()) return true;
|
||||||
|
} else {
|
||||||
|
// no special flags, no checkpoint, check ack, load_check
|
||||||
|
utils::Assert(act.min_seqno() != ActionSummary::kMaxSeq, "min seq bug");
|
||||||
|
if (act.diff_seq()) {
|
||||||
|
bool requester = req.min_seqno() == act.min_seqno();
|
||||||
|
if (!CheckAndRecover(TryGetResult(buf, size, act.min_seqno(), requester))) continue;
|
||||||
|
if (requester) return true;
|
||||||
|
} else {
|
||||||
|
// all the request is same, this is most recent command that is yet to be executed
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// something is still incomplete try next round
|
||||||
|
}
|
||||||
|
}
|
||||||
|
utils::Assert(false, "RecoverExec: should not reach here");
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace engine
|
} // namespace engine
|
||||||
|
|||||||
@ -89,15 +89,19 @@ class AllReduceRobust : public AllReduceBase {
|
|||||||
inline int min_seqno(void) const {
|
inline int min_seqno(void) const {
|
||||||
return seqcode >> 4;
|
return seqcode >> 4;
|
||||||
}
|
}
|
||||||
|
// whether the operation set contains a load_check
|
||||||
|
inline bool load_check(void) const {
|
||||||
|
return (seqcode & kLoadCheck) != 0;
|
||||||
|
}
|
||||||
// whether the operation set contains a check point
|
// whether the operation set contains a check point
|
||||||
inline bool check_point(void) const {
|
inline bool check_point(void) const {
|
||||||
return (seqcode & kCheckPoint) != 0;
|
return (seqcode & kCheckPoint) != 0;
|
||||||
}
|
}
|
||||||
// whether the operation set contains a check point
|
// whether the operation set contains a check ack
|
||||||
inline bool check_ack(void) const {
|
inline bool check_ack(void) const {
|
||||||
return (seqcode & kCheckAck) != 0;
|
return (seqcode & kCheckAck) != 0;
|
||||||
}
|
}
|
||||||
// whether the operation set contains a check point
|
// whether the operation set contains different sequence number
|
||||||
inline bool diff_seq(void) const {
|
inline bool diff_seq(void) const {
|
||||||
return (seqcode & kDiffSeq) != 0;
|
return (seqcode & kDiffSeq) != 0;
|
||||||
}
|
}
|
||||||
@ -186,15 +190,62 @@ class AllReduceRobust : public AllReduceBase {
|
|||||||
*/
|
*/
|
||||||
ReturnType TryResetLinks(void);
|
ReturnType TryResetLinks(void);
|
||||||
/*!
|
/*!
|
||||||
* \brief Run recovery execution of a action specified by flag and seqno,
|
* \brief try to reconnect the broken links
|
||||||
* there can be two outcome of the function
|
* \return this function can kSuccess or kSockError
|
||||||
*
|
|
||||||
* \param sendrecvbuf_
|
|
||||||
*
|
|
||||||
* \return if this function returns true, this means
|
|
||||||
* behind and we will be able to recover data from existing node
|
|
||||||
*/
|
*/
|
||||||
bool RecoverExec(void *sendrecvbuf_, size_t size, int flag, int seqno);
|
ReturnType TryReConnectLinks(void);
|
||||||
|
/*!
|
||||||
|
* \brief if err_type indicates an error
|
||||||
|
* recover links according to the error type reported
|
||||||
|
* if there is no error, return true
|
||||||
|
* \param err_type the type of error happening in the system
|
||||||
|
* \return true if err_type is kSuccess, false otherwise
|
||||||
|
*/
|
||||||
|
bool CheckAndRecover(ReturnType err_type);
|
||||||
|
/*!
|
||||||
|
* \brief try to run recover execution for a request action described by flag and seqno,
|
||||||
|
* the function will keep blocking to run possible recovery operations before the specified action,
|
||||||
|
* until the requested result is received by a recovering procedure,
|
||||||
|
* or the function discovers that the requested action is not yet executed, and return false
|
||||||
|
*
|
||||||
|
* \param buf the buffer to store the result
|
||||||
|
* \param size the total size of the buffer
|
||||||
|
* \param flag flag information about the action \sa ActionSummary
|
||||||
|
* \param seqno sequence number of the action, if it is special action with flag set, seqno needs to be set to ActionSummary::kMaxSeq
|
||||||
|
*
|
||||||
|
* \return if this function can return true or false
|
||||||
|
* - true means buf already set to the
|
||||||
|
* result by recovering procedure, the action is complete, no further action is needed
|
||||||
|
* - false means this is the lastest action that has not yet been executed, need to execute the action
|
||||||
|
*/
|
||||||
|
bool RecoverExec(void *buf, size_t size, int flag, int seqno = ActionSummary::kMaxSeq);
|
||||||
|
/*!
|
||||||
|
* \brief try to load check point
|
||||||
|
*
|
||||||
|
* This is a collaborative function called by all nodes
|
||||||
|
* only the nodes with requester set to true really needs to load the check point
|
||||||
|
* other nodes acts as collaborative roles to complete this request
|
||||||
|
*
|
||||||
|
* \param requester whether current node is the requester
|
||||||
|
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
|
||||||
|
* \sa ReturnType
|
||||||
|
*/
|
||||||
|
ReturnType TryLoadCheckPoint(bool requester);
|
||||||
|
/*!
|
||||||
|
* \brief try to get the result of operation specified by seqno
|
||||||
|
*
|
||||||
|
* This is a collaborative function called by all nodes
|
||||||
|
* only the nodes with requester set to true really needs to get the result
|
||||||
|
* other nodes acts as collaborative roles to complete this request
|
||||||
|
*
|
||||||
|
* \param buf the buffer to store the result, this parameter is only use when current node is requester
|
||||||
|
* \param size the total size of the buffer, this parameter is only use when current node is requester
|
||||||
|
* \param seqno sequence number of the operation, this is unique index of a operation in current iteration
|
||||||
|
* \param requester whether current node is the requester
|
||||||
|
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
|
||||||
|
* \sa ReturnType
|
||||||
|
*/
|
||||||
|
ReturnType TryGetResult(void *buf, size_t size, int seqno, bool requester);
|
||||||
//---- recovery data structure ----
|
//---- recovery data structure ----
|
||||||
// call sequence counter, records how many calls we made so far
|
// call sequence counter, records how many calls we made so far
|
||||||
// from last call to CheckPoint, LoadCheckPoint
|
// from last call to CheckPoint, LoadCheckPoint
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user