ResetLink still not ok
This commit is contained in:
parent
6b18ee9edb
commit
aa2cb38543
@ -283,7 +283,17 @@ AllreduceRobust::ReturnType AllreduceRobust::TryResetLinks(void) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
while (true) {
|
while (true) {
|
||||||
|
utils::SelectHelper rsel;
|
||||||
|
bool finished = true;
|
||||||
for (int i = 0; i < nlink; ++i) {
|
for (int i = 0; i < nlink; ++i) {
|
||||||
|
if (all_links[i].size_read == 0 && !all_links[i].sock.BadSocket()) {
|
||||||
|
rsel.WatchRead(all_links[i].sock); finished = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (finished) break;
|
||||||
|
rsel.Select();
|
||||||
|
for (int i = 0; i < nlink; ++i) {
|
||||||
|
if (all_links[i].sock.BadSocket()) continue;
|
||||||
if (all_links[i].size_read == 0) {
|
if (all_links[i].size_read == 0) {
|
||||||
int atmark = all_links[i].sock.AtMark();
|
int atmark = all_links[i].sock.AtMark();
|
||||||
if (atmark < 0) {
|
if (atmark < 0) {
|
||||||
@ -299,17 +309,7 @@ AllreduceRobust::ReturnType AllreduceRobust::TryResetLinks(void) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
utils::SelectHelper rsel;
|
|
||||||
bool finished = true;
|
|
||||||
for (int i = 0; i < nlink; ++i) {
|
|
||||||
if (all_links[i].size_read == 0 && !all_links[i].sock.BadSocket()) {
|
|
||||||
rsel.WatchRead(all_links[i].sock); finished = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (finished) break;
|
|
||||||
rsel.Select();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// start synchronization, use blocking I/O to avoid select
|
// start synchronization, use blocking I/O to avoid select
|
||||||
for (int i = 0; i < nlink; ++i) {
|
for (int i = 0; i < nlink; ++i) {
|
||||||
if (!all_links[i].sock.BadSocket()) {
|
if (!all_links[i].sock.BadSocket()) {
|
||||||
@ -365,13 +365,15 @@ AllreduceRobust::ReturnType AllreduceRobust::TryResetLinks(void) {
|
|||||||
*/
|
*/
|
||||||
bool AllreduceRobust::CheckAndRecover(ReturnType err_type) {
|
bool AllreduceRobust::CheckAndRecover(ReturnType err_type) {
|
||||||
if (err_type == kSuccess) return true;
|
if (err_type == kSuccess) return true;
|
||||||
// simple way, shutdown all links
|
{// simple way, shutdown all links
|
||||||
for (size_t i = 0; i < all_links.size(); ++i) {
|
for (size_t i = 0; i < all_links.size(); ++i) {
|
||||||
if (!all_links[i].sock.BadSocket()) all_links[i].sock.Close();
|
if (!all_links[i].sock.BadSocket()) all_links[i].sock.Close();
|
||||||
|
}
|
||||||
|
ReConnectLinks("recover");
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
ReConnectLinks("recover");
|
|
||||||
return false;
|
|
||||||
// this was old way
|
// this was old way
|
||||||
|
// TryResetLinks still causes possible errors, so not use this one
|
||||||
while(err_type != kSuccess) {
|
while(err_type != kSuccess) {
|
||||||
switch(err_type) {
|
switch(err_type) {
|
||||||
case kGetExcept: err_type = TryResetLinks(); break;
|
case kGetExcept: err_type = TryResetLinks(); break;
|
||||||
|
|||||||
@ -95,7 +95,11 @@ class AllreduceRobust : public AllreduceBase {
|
|||||||
* this function is only used for test purpose
|
* this function is only used for test purpose
|
||||||
*/
|
*/
|
||||||
virtual void InitAfterException(void) {
|
virtual void InitAfterException(void) {
|
||||||
this->CheckAndRecover(kGetExcept);
|
// simple way, shutdown all links
|
||||||
|
for (size_t i = 0; i < all_links.size(); ++i) {
|
||||||
|
if (!all_links[i].sock.BadSocket()) all_links[i].sock.Close();
|
||||||
|
}
|
||||||
|
ReConnectLinks("recover");
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user