Fix error handling in the event loop. (#9990)

This commit is contained in:
Jiaming Yuan 2024-01-17 05:35:35 +08:00 committed by GitHub
parent 0798e36d73
commit 85d09245f6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 120 additions and 80 deletions

View File

@ -1,11 +1,19 @@
/**
* Copyright 2023, XGBoost Contributors
* Copyright 2023-2024, XGBoost Contributors
*/
#include "loop.h"
#include <queue> // for queue
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <exception> // for exception, current_exception, rethrow_exception
#include <mutex> // for lock_guard, unique_lock
#include <queue> // for queue
#include <string> // for string
#include <thread> // for thread
#include <utility> // for move
#include "rabit/internal/socket.h" // for PollHelper
#include "xgboost/collective/result.h" // for Fail, Success
#include "xgboost/collective/socket.h" // for FailWithCode
#include "xgboost/logging.h" // for CHECK
@ -109,62 +117,94 @@ Result Loop::EmptyQueue(std::queue<Op>* p_queue) const {
}
void Loop::Process() {
// consumer
while (true) {
std::unique_lock lock{mu_};
cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; });
if (stop_) {
break;
}
auto set_rc = [this](Result&& rc) {
std::lock_guard lock{rc_lock_};
rc_ = std::forward<Result>(rc);
};
// This loop cannot exit unless `stop_` is set to true. There must always be a thread to
// answer the blocking call even if there are errors, otherwise the blocking will wait
// forever.
while (true) {
try {
std::unique_lock lock{mu_};
cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; });
if (stop_) {
break; // only point where this loop can exit.
}
// Move the global queue into a local variable to unblock it.
std::queue<Op> qcopy;
bool is_blocking = false;
while (!queue_.empty()) {
auto op = queue_.front();
queue_.pop();
if (op.code == Op::kBlock) {
is_blocking = true;
// Block must be the last op in the current batch since no further submit can be
// issued until the blocking call is finished.
CHECK(queue_.empty());
} else {
qcopy.push(op);
}
}
auto unlock_notify = [&](bool is_blocking, bool stop) {
if (!is_blocking) {
std::lock_guard guard{mu_};
stop_ = stop;
} else {
stop_ = stop;
// Unblock, we can write to the global queue again.
lock.unlock();
}
cv_.notify_one();
};
// move the queue
std::queue<Op> qcopy;
bool is_blocking = false;
while (!queue_.empty()) {
auto op = queue_.front();
queue_.pop();
if (op.code == Op::kBlock) {
is_blocking = true;
} else {
qcopy.push(op);
// Clear the local queue, this is blocking the current worker thread (but not the
// client thread), wait until all operations are finished.
auto rc = this->EmptyQueue(&qcopy);
if (is_blocking) {
// The unlock is delayed if this is a blocking call
lock.unlock();
}
}
// unblock the queue
if (!is_blocking) {
lock.unlock();
}
// clear the queue
auto rc = this->EmptyQueue(&qcopy);
// Handle error
if (!rc.OK()) {
unlock_notify(is_blocking, true);
std::lock_guard<std::mutex> guard{rc_lock_};
this->rc_ = std::move(rc);
return;
}
CHECK(qcopy.empty());
unlock_notify(is_blocking, false);
// Notify the client thread who called block after all error conditions are set.
auto notify_if_block = [&] {
if (is_blocking) {
std::unique_lock lock{mu_};
block_done_ = true;
lock.unlock();
block_cv_.notify_one();
}
};
// Handle error
if (!rc.OK()) {
set_rc(std::move(rc));
} else {
CHECK(qcopy.empty());
}
notify_if_block();
} catch (std::exception const& e) {
curr_exce_ = std::current_exception();
set_rc(Fail("Exception inside the event loop:" + std::string{e.what()}));
} catch (...) {
curr_exce_ = std::current_exception();
set_rc(Fail("Unknown exception inside the event loop."));
}
}
}
Result Loop::Stop() {
// Finish all remaining tasks
CHECK_EQ(this->Block().OK(), this->rc_.OK());
// Notify the loop to stop
std::unique_lock lock{mu_};
stop_ = true;
lock.unlock();
this->cv_.notify_one();
CHECK_EQ(this->Block().OK(), this->rc_.OK());
if (this->worker_.joinable()) {
this->worker_.join();
}
if (curr_exce_) {
std::rethrow_exception(curr_exce_);
@ -175,17 +215,29 @@ Result Loop::Stop() {
[[nodiscard]] Result Loop::Block() {
{
// Check whether the last op was successful, stop if not.
std::lock_guard<std::mutex> guard{rc_lock_};
if (!rc_.OK()) {
return std::move(rc_);
stop_ = true;
}
}
this->Submit(Op{Op::kBlock});
{
std::unique_lock lock{mu_};
cv_.wait(lock, [this] { return (this->queue_.empty()) || stop_; });
if (!this->worker_.joinable()) {
std::lock_guard<std::mutex> guard{rc_lock_};
return Fail("Worker has stopped.", std::move(rc_));
}
this->Submit(Op{Op::kBlock});
{
// Wait for the block call to finish.
std::unique_lock lock{mu_};
block_cv_.wait(lock, [this] { return block_done_ || stop_; });
block_done_ = false;
}
{
// Transfer the rc.
std::lock_guard<std::mutex> lock{rc_lock_};
return std::move(rc_);
}
@ -193,26 +245,6 @@ Result Loop::Stop() {
Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} {
timer_.Init(__func__);
worker_ = std::thread{[this] {
try {
this->Process();
} catch (std::exception const& e) {
std::lock_guard<std::mutex> guard{mu_};
if (!curr_exce_) {
curr_exce_ = std::current_exception();
rc_ = Fail("Exception was thrown");
}
stop_ = true;
cv_.notify_all();
} catch (...) {
std::lock_guard<std::mutex> guard{mu_};
if (!curr_exce_) {
curr_exce_ = std::current_exception();
rc_ = Fail("Exception was thrown");
}
stop_ = true;
cv_.notify_all();
}
}};
worker_ = std::thread{[this] { this->Process(); }};
}
} // namespace xgboost::collective

View File

@ -1,5 +1,5 @@
/**
* Copyright 2023, XGBoost Contributors
* Copyright 2023-2024, XGBoost Contributors
*/
#pragma once
#include <chrono> // for seconds
@ -10,7 +10,6 @@
#include <mutex> // for unique_lock, mutex
#include <queue> // for queue
#include <thread> // for thread
#include <utility> // for move
#include "../common/timer.h" // for Monitor
#include "xgboost/collective/result.h" // for Result
@ -37,10 +36,15 @@ class Loop {
};
private:
std::thread worker_;
std::condition_variable cv_;
std::mutex mu_;
std::queue<Op> queue_;
std::thread worker_; // thread worker to execute the tasks
std::condition_variable cv_; // CV used to notify a new submit call
std::condition_variable block_cv_; // CV used to notify the blocking call
bool block_done_{false}; // Flag to indicate whether the blocking call has finished.
std::queue<Op> queue_; // event queue
std::mutex mu_; // mutex to protect the queue, cv, and block_done
std::chrono::seconds timeout_;
Result rc_;
@ -51,29 +55,33 @@ class Loop {
common::Monitor mutable timer_;
Result EmptyQueue(std::queue<Op>* p_queue) const;
// The cunsumer function that runs inside a worker thread.
void Process();
public:
/**
* @brief Stop the worker thread.
*/
Result Stop();
void Submit(Op op) {
// producer
std::unique_lock lock{mu_};
queue_.push(op);
lock.unlock();
cv_.notify_one();
}
/**
* @brief Block the event loop until all ops are finished. In the case of failure, this
* loop should be not be used for new operations.
*/
[[nodiscard]] Result Block();
explicit Loop(std::chrono::seconds timeout);
~Loop() noexcept(false) {
// The worker will be joined in the stop function.
this->Stop();
if (worker_.joinable()) {
worker_.join();
}
}
};
} // namespace xgboost::collective