[coll] Implement shutdown for tracker and comm. (#10208)
- Force shutdown the tracker. - Implement shutdown notice for error handling thread in comm.
This commit is contained in:
@@ -100,6 +100,24 @@ std::enable_if_t<std::is_integral_v<E>, xgboost::collective::Result> PollError(E
|
||||
if ((revents & POLLNVAL) != 0) {
|
||||
return xgboost::system::FailWithCode("Invalid polling request.");
|
||||
}
|
||||
if ((revents & POLLHUP) != 0) {
|
||||
// Excerpt from the Linux manual:
|
||||
//
|
||||
// Note that when reading from a channel such as a pipe or a stream socket, this event
|
||||
// merely indicates that the peer closed its end of the channel. Subsequent reads from
|
||||
// the channel will return 0 (end of file) only after all outstanding data in the
|
||||
// channel has been consumed.
|
||||
//
|
||||
// We don't usually have a barrier for exiting workers, it's normal to have one end
|
||||
// exit while the other is still reading data.
|
||||
return xgboost::collective::Success();
|
||||
}
|
||||
#if defined(POLLRDHUP)
|
||||
// Linux only flag
|
||||
if ((revents & POLLRDHUP) != 0) {
|
||||
return xgboost::system::FailWithCode("Poll hung up on the other end.");
|
||||
}
|
||||
#endif // defined(POLLRDHUP)
|
||||
return xgboost::collective::Success();
|
||||
}
|
||||
|
||||
@@ -179,9 +197,11 @@ struct PollHelper {
|
||||
}
|
||||
std::int32_t ret = PollImpl(fdset.data(), fdset.size(), timeout);
|
||||
if (ret == 0) {
|
||||
return xgboost::collective::Fail("Poll timeout.", std::make_error_code(std::errc::timed_out));
|
||||
return xgboost::collective::Fail(
|
||||
"Poll timeout:" + std::to_string(timeout.count()) + " seconds.",
|
||||
std::make_error_code(std::errc::timed_out));
|
||||
} else if (ret < 0) {
|
||||
return xgboost::system::FailWithCode("Poll failed.");
|
||||
return xgboost::system::FailWithCode("Poll failed, nfds:" + std::to_string(fdset.size()));
|
||||
}
|
||||
|
||||
for (auto& pfd : fdset) {
|
||||
|
||||
Reference in New Issue
Block a user