Fix error handling in the event loop. (#9990)
This commit is contained in:
parent
0798e36d73
commit
85d09245f6
@ -1,11 +1,19 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2023, XGBoost Contributors
|
* Copyright 2023-2024, XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#include "loop.h"
|
#include "loop.h"
|
||||||
|
|
||||||
|
#include <cstddef> // for size_t
|
||||||
|
#include <cstdint> // for int32_t
|
||||||
|
#include <exception> // for exception, current_exception, rethrow_exception
|
||||||
|
#include <mutex> // for lock_guard, unique_lock
|
||||||
#include <queue> // for queue
|
#include <queue> // for queue
|
||||||
|
#include <string> // for string
|
||||||
|
#include <thread> // for thread
|
||||||
|
#include <utility> // for move
|
||||||
|
|
||||||
#include "rabit/internal/socket.h" // for PollHelper
|
#include "rabit/internal/socket.h" // for PollHelper
|
||||||
|
#include "xgboost/collective/result.h" // for Fail, Success
|
||||||
#include "xgboost/collective/socket.h" // for FailWithCode
|
#include "xgboost/collective/socket.h" // for FailWithCode
|
||||||
#include "xgboost/logging.h" // for CHECK
|
#include "xgboost/logging.h" // for CHECK
|
||||||
|
|
||||||
@ -109,62 +117,94 @@ Result Loop::EmptyQueue(std::queue<Op>* p_queue) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void Loop::Process() {
|
void Loop::Process() {
|
||||||
// consumer
|
auto set_rc = [this](Result&& rc) {
|
||||||
|
std::lock_guard lock{rc_lock_};
|
||||||
|
rc_ = std::forward<Result>(rc);
|
||||||
|
};
|
||||||
|
|
||||||
|
// This loop cannot exit unless `stop_` is set to true. There must always be a thread to
|
||||||
|
// answer the blocking call even if there are errors, otherwise the blocking will wait
|
||||||
|
// forever.
|
||||||
while (true) {
|
while (true) {
|
||||||
|
try {
|
||||||
std::unique_lock lock{mu_};
|
std::unique_lock lock{mu_};
|
||||||
cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; });
|
cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; });
|
||||||
if (stop_) {
|
if (stop_) {
|
||||||
break;
|
break; // only point where this loop can exit.
|
||||||
}
|
}
|
||||||
|
|
||||||
auto unlock_notify = [&](bool is_blocking, bool stop) {
|
// Move the global queue into a local variable to unblock it.
|
||||||
if (!is_blocking) {
|
|
||||||
std::lock_guard guard{mu_};
|
|
||||||
stop_ = stop;
|
|
||||||
} else {
|
|
||||||
stop_ = stop;
|
|
||||||
lock.unlock();
|
|
||||||
}
|
|
||||||
cv_.notify_one();
|
|
||||||
};
|
|
||||||
|
|
||||||
// move the queue
|
|
||||||
std::queue<Op> qcopy;
|
std::queue<Op> qcopy;
|
||||||
|
|
||||||
bool is_blocking = false;
|
bool is_blocking = false;
|
||||||
while (!queue_.empty()) {
|
while (!queue_.empty()) {
|
||||||
auto op = queue_.front();
|
auto op = queue_.front();
|
||||||
queue_.pop();
|
queue_.pop();
|
||||||
if (op.code == Op::kBlock) {
|
if (op.code == Op::kBlock) {
|
||||||
is_blocking = true;
|
is_blocking = true;
|
||||||
|
// Block must be the last op in the current batch since no further submit can be
|
||||||
|
// issued until the blocking call is finished.
|
||||||
|
CHECK(queue_.empty());
|
||||||
} else {
|
} else {
|
||||||
qcopy.push(op);
|
qcopy.push(op);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// unblock the queue
|
|
||||||
if (!is_blocking) {
|
if (!is_blocking) {
|
||||||
|
// Unblock, we can write to the global queue again.
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
}
|
}
|
||||||
// clear the queue
|
|
||||||
|
// Clear the local queue, this is blocking the current worker thread (but not the
|
||||||
|
// client thread), wait until all operations are finished.
|
||||||
auto rc = this->EmptyQueue(&qcopy);
|
auto rc = this->EmptyQueue(&qcopy);
|
||||||
// Handle error
|
|
||||||
if (!rc.OK()) {
|
if (is_blocking) {
|
||||||
unlock_notify(is_blocking, true);
|
// The unlock is delayed if this is a blocking call
|
||||||
std::lock_guard<std::mutex> guard{rc_lock_};
|
lock.unlock();
|
||||||
this->rc_ = std::move(rc);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Notify the client thread who called block after all error conditions are set.
|
||||||
|
auto notify_if_block = [&] {
|
||||||
|
if (is_blocking) {
|
||||||
|
std::unique_lock lock{mu_};
|
||||||
|
block_done_ = true;
|
||||||
|
lock.unlock();
|
||||||
|
block_cv_.notify_one();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Handle error
|
||||||
|
if (!rc.OK()) {
|
||||||
|
set_rc(std::move(rc));
|
||||||
|
} else {
|
||||||
CHECK(qcopy.empty());
|
CHECK(qcopy.empty());
|
||||||
unlock_notify(is_blocking, false);
|
}
|
||||||
|
|
||||||
|
notify_if_block();
|
||||||
|
} catch (std::exception const& e) {
|
||||||
|
curr_exce_ = std::current_exception();
|
||||||
|
set_rc(Fail("Exception inside the event loop:" + std::string{e.what()}));
|
||||||
|
} catch (...) {
|
||||||
|
curr_exce_ = std::current_exception();
|
||||||
|
set_rc(Fail("Unknown exception inside the event loop."));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Result Loop::Stop() {
|
Result Loop::Stop() {
|
||||||
|
// Finish all remaining tasks
|
||||||
|
CHECK_EQ(this->Block().OK(), this->rc_.OK());
|
||||||
|
|
||||||
|
// Notify the loop to stop
|
||||||
std::unique_lock lock{mu_};
|
std::unique_lock lock{mu_};
|
||||||
stop_ = true;
|
stop_ = true;
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
|
this->cv_.notify_one();
|
||||||
|
|
||||||
CHECK_EQ(this->Block().OK(), this->rc_.OK());
|
if (this->worker_.joinable()) {
|
||||||
|
this->worker_.join();
|
||||||
|
}
|
||||||
|
|
||||||
if (curr_exce_) {
|
if (curr_exce_) {
|
||||||
std::rethrow_exception(curr_exce_);
|
std::rethrow_exception(curr_exce_);
|
||||||
@ -175,17 +215,29 @@ Result Loop::Stop() {
|
|||||||
|
|
||||||
[[nodiscard]] Result Loop::Block() {
|
[[nodiscard]] Result Loop::Block() {
|
||||||
{
|
{
|
||||||
|
// Check whether the last op was successful, stop if not.
|
||||||
std::lock_guard<std::mutex> guard{rc_lock_};
|
std::lock_guard<std::mutex> guard{rc_lock_};
|
||||||
if (!rc_.OK()) {
|
if (!rc_.OK()) {
|
||||||
return std::move(rc_);
|
stop_ = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!this->worker_.joinable()) {
|
||||||
|
std::lock_guard<std::mutex> guard{rc_lock_};
|
||||||
|
return Fail("Worker has stopped.", std::move(rc_));
|
||||||
|
}
|
||||||
|
|
||||||
this->Submit(Op{Op::kBlock});
|
this->Submit(Op{Op::kBlock});
|
||||||
|
|
||||||
{
|
{
|
||||||
|
// Wait for the block call to finish.
|
||||||
std::unique_lock lock{mu_};
|
std::unique_lock lock{mu_};
|
||||||
cv_.wait(lock, [this] { return (this->queue_.empty()) || stop_; });
|
block_cv_.wait(lock, [this] { return block_done_ || stop_; });
|
||||||
|
block_done_ = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
// Transfer the rc.
|
||||||
std::lock_guard<std::mutex> lock{rc_lock_};
|
std::lock_guard<std::mutex> lock{rc_lock_};
|
||||||
return std::move(rc_);
|
return std::move(rc_);
|
||||||
}
|
}
|
||||||
@ -193,26 +245,6 @@ Result Loop::Stop() {
|
|||||||
|
|
||||||
Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} {
|
Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} {
|
||||||
timer_.Init(__func__);
|
timer_.Init(__func__);
|
||||||
worker_ = std::thread{[this] {
|
worker_ = std::thread{[this] { this->Process(); }};
|
||||||
try {
|
|
||||||
this->Process();
|
|
||||||
} catch (std::exception const& e) {
|
|
||||||
std::lock_guard<std::mutex> guard{mu_};
|
|
||||||
if (!curr_exce_) {
|
|
||||||
curr_exce_ = std::current_exception();
|
|
||||||
rc_ = Fail("Exception was thrown");
|
|
||||||
}
|
|
||||||
stop_ = true;
|
|
||||||
cv_.notify_all();
|
|
||||||
} catch (...) {
|
|
||||||
std::lock_guard<std::mutex> guard{mu_};
|
|
||||||
if (!curr_exce_) {
|
|
||||||
curr_exce_ = std::current_exception();
|
|
||||||
rc_ = Fail("Exception was thrown");
|
|
||||||
}
|
|
||||||
stop_ = true;
|
|
||||||
cv_.notify_all();
|
|
||||||
}
|
|
||||||
}};
|
|
||||||
}
|
}
|
||||||
} // namespace xgboost::collective
|
} // namespace xgboost::collective
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2023, XGBoost Contributors
|
* Copyright 2023-2024, XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#pragma once
|
#pragma once
|
||||||
#include <chrono> // for seconds
|
#include <chrono> // for seconds
|
||||||
@ -10,7 +10,6 @@
|
|||||||
#include <mutex> // for unique_lock, mutex
|
#include <mutex> // for unique_lock, mutex
|
||||||
#include <queue> // for queue
|
#include <queue> // for queue
|
||||||
#include <thread> // for thread
|
#include <thread> // for thread
|
||||||
#include <utility> // for move
|
|
||||||
|
|
||||||
#include "../common/timer.h" // for Monitor
|
#include "../common/timer.h" // for Monitor
|
||||||
#include "xgboost/collective/result.h" // for Result
|
#include "xgboost/collective/result.h" // for Result
|
||||||
@ -37,10 +36,15 @@ class Loop {
|
|||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::thread worker_;
|
std::thread worker_; // thread worker to execute the tasks
|
||||||
std::condition_variable cv_;
|
|
||||||
std::mutex mu_;
|
std::condition_variable cv_; // CV used to notify a new submit call
|
||||||
std::queue<Op> queue_;
|
std::condition_variable block_cv_; // CV used to notify the blocking call
|
||||||
|
bool block_done_{false}; // Flag to indicate whether the blocking call has finished.
|
||||||
|
|
||||||
|
std::queue<Op> queue_; // event queue
|
||||||
|
std::mutex mu_; // mutex to protect the queue, cv, and block_done
|
||||||
|
|
||||||
std::chrono::seconds timeout_;
|
std::chrono::seconds timeout_;
|
||||||
|
|
||||||
Result rc_;
|
Result rc_;
|
||||||
@ -51,29 +55,33 @@ class Loop {
|
|||||||
common::Monitor mutable timer_;
|
common::Monitor mutable timer_;
|
||||||
|
|
||||||
Result EmptyQueue(std::queue<Op>* p_queue) const;
|
Result EmptyQueue(std::queue<Op>* p_queue) const;
|
||||||
|
// The consumer function that runs inside a worker thread.
|
||||||
void Process();
|
void Process();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
/**
|
||||||
|
* @brief Stop the worker thread.
|
||||||
|
*/
|
||||||
Result Stop();
|
Result Stop();
|
||||||
|
|
||||||
void Submit(Op op) {
|
void Submit(Op op) {
|
||||||
// producer
|
|
||||||
std::unique_lock lock{mu_};
|
std::unique_lock lock{mu_};
|
||||||
queue_.push(op);
|
queue_.push(op);
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
cv_.notify_one();
|
cv_.notify_one();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Block the event loop until all ops are finished. In the case of failure, this
|
||||||
|
* loop should not be used for new operations.
|
||||||
|
*/
|
||||||
[[nodiscard]] Result Block();
|
[[nodiscard]] Result Block();
|
||||||
|
|
||||||
explicit Loop(std::chrono::seconds timeout);
|
explicit Loop(std::chrono::seconds timeout);
|
||||||
|
|
||||||
~Loop() noexcept(false) {
|
~Loop() noexcept(false) {
|
||||||
|
// The worker will be joined in the stop function.
|
||||||
this->Stop();
|
this->Stop();
|
||||||
|
|
||||||
if (worker_.joinable()) {
|
|
||||||
worker_.join();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
} // namespace xgboost::collective
|
} // namespace xgboost::collective
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user