Fix error handling in the event loop. (#9990)

This commit is contained in:
Jiaming Yuan 2024-01-17 05:35:35 +08:00 committed by GitHub
parent 0798e36d73
commit 85d09245f6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 120 additions and 80 deletions

View File

@ -1,11 +1,19 @@
/** /**
* Copyright 2023, XGBoost Contributors * Copyright 2023-2024, XGBoost Contributors
*/ */
#include "loop.h" #include "loop.h"
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <exception> // for exception, current_exception, rethrow_exception
#include <mutex> // for lock_guard, unique_lock
#include <queue> // for queue #include <queue> // for queue
#include <string> // for string
#include <thread> // for thread
#include <utility> // for move
#include "rabit/internal/socket.h" // for PollHelper #include "rabit/internal/socket.h" // for PollHelper
#include "xgboost/collective/result.h" // for Fail, Success
#include "xgboost/collective/socket.h" // for FailWithCode #include "xgboost/collective/socket.h" // for FailWithCode
#include "xgboost/logging.h" // for CHECK #include "xgboost/logging.h" // for CHECK
@ -109,62 +117,94 @@ Result Loop::EmptyQueue(std::queue<Op>* p_queue) const {
} }
void Loop::Process() { void Loop::Process() {
// consumer auto set_rc = [this](Result&& rc) {
std::lock_guard lock{rc_lock_};
rc_ = std::forward<Result>(rc);
};
// This loop cannot exit unless `stop_` is set to true. There must always be a thread to
// answer the blocking call even if there are errors, otherwise the blocking will wait
// forever.
while (true) { while (true) {
try {
std::unique_lock lock{mu_}; std::unique_lock lock{mu_};
cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; }); cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; });
if (stop_) { if (stop_) {
break; break; // only point where this loop can exit.
} }
auto unlock_notify = [&](bool is_blocking, bool stop) { // Move the global queue into a local variable to unblock it.
if (!is_blocking) {
std::lock_guard guard{mu_};
stop_ = stop;
} else {
stop_ = stop;
lock.unlock();
}
cv_.notify_one();
};
// move the queue
std::queue<Op> qcopy; std::queue<Op> qcopy;
bool is_blocking = false; bool is_blocking = false;
while (!queue_.empty()) { while (!queue_.empty()) {
auto op = queue_.front(); auto op = queue_.front();
queue_.pop(); queue_.pop();
if (op.code == Op::kBlock) { if (op.code == Op::kBlock) {
is_blocking = true; is_blocking = true;
// Block must be the last op in the current batch since no further submit can be
// issued until the blocking call is finished.
CHECK(queue_.empty());
} else { } else {
qcopy.push(op); qcopy.push(op);
} }
} }
// unblock the queue
if (!is_blocking) { if (!is_blocking) {
// Unblock, we can write to the global queue again.
lock.unlock(); lock.unlock();
} }
// clear the queue
// Clear the local queue, this is blocking the current worker thread (but not the
// client thread), wait until all operations are finished.
auto rc = this->EmptyQueue(&qcopy); auto rc = this->EmptyQueue(&qcopy);
// Handle error
if (!rc.OK()) { if (is_blocking) {
unlock_notify(is_blocking, true); // The unlock is delayed if this is a blocking call
std::lock_guard<std::mutex> guard{rc_lock_}; lock.unlock();
this->rc_ = std::move(rc);
return;
} }
// Notify the client thread who called block after all error conditions are set.
auto notify_if_block = [&] {
if (is_blocking) {
std::unique_lock lock{mu_};
block_done_ = true;
lock.unlock();
block_cv_.notify_one();
}
};
// Handle error
if (!rc.OK()) {
set_rc(std::move(rc));
} else {
CHECK(qcopy.empty()); CHECK(qcopy.empty());
unlock_notify(is_blocking, false); }
notify_if_block();
} catch (std::exception const& e) {
curr_exce_ = std::current_exception();
set_rc(Fail("Exception inside the event loop:" + std::string{e.what()}));
} catch (...) {
curr_exce_ = std::current_exception();
set_rc(Fail("Unknown exception inside the event loop."));
}
} }
} }
Result Loop::Stop() { Result Loop::Stop() {
// Finish all remaining tasks
CHECK_EQ(this->Block().OK(), this->rc_.OK());
// Notify the loop to stop
std::unique_lock lock{mu_}; std::unique_lock lock{mu_};
stop_ = true; stop_ = true;
lock.unlock(); lock.unlock();
this->cv_.notify_one();
CHECK_EQ(this->Block().OK(), this->rc_.OK()); if (this->worker_.joinable()) {
this->worker_.join();
}
if (curr_exce_) { if (curr_exce_) {
std::rethrow_exception(curr_exce_); std::rethrow_exception(curr_exce_);
@ -175,17 +215,29 @@ Result Loop::Stop() {
[[nodiscard]] Result Loop::Block() { [[nodiscard]] Result Loop::Block() {
{ {
// Check whether the last op was successful, stop if not.
std::lock_guard<std::mutex> guard{rc_lock_}; std::lock_guard<std::mutex> guard{rc_lock_};
if (!rc_.OK()) { if (!rc_.OK()) {
return std::move(rc_); stop_ = true;
} }
} }
if (!this->worker_.joinable()) {
std::lock_guard<std::mutex> guard{rc_lock_};
return Fail("Worker has stopped.", std::move(rc_));
}
this->Submit(Op{Op::kBlock}); this->Submit(Op{Op::kBlock});
{ {
// Wait for the block call to finish.
std::unique_lock lock{mu_}; std::unique_lock lock{mu_};
cv_.wait(lock, [this] { return (this->queue_.empty()) || stop_; }); block_cv_.wait(lock, [this] { return block_done_ || stop_; });
block_done_ = false;
} }
{ {
// Transfer the rc.
std::lock_guard<std::mutex> lock{rc_lock_}; std::lock_guard<std::mutex> lock{rc_lock_};
return std::move(rc_); return std::move(rc_);
} }
@ -193,26 +245,6 @@ Result Loop::Stop() {
Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} { Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} {
timer_.Init(__func__); timer_.Init(__func__);
worker_ = std::thread{[this] { worker_ = std::thread{[this] { this->Process(); }};
try {
this->Process();
} catch (std::exception const& e) {
std::lock_guard<std::mutex> guard{mu_};
if (!curr_exce_) {
curr_exce_ = std::current_exception();
rc_ = Fail("Exception was thrown");
}
stop_ = true;
cv_.notify_all();
} catch (...) {
std::lock_guard<std::mutex> guard{mu_};
if (!curr_exce_) {
curr_exce_ = std::current_exception();
rc_ = Fail("Exception was thrown");
}
stop_ = true;
cv_.notify_all();
}
}};
} }
} // namespace xgboost::collective } // namespace xgboost::collective

View File

@ -1,5 +1,5 @@
/** /**
* Copyright 2023, XGBoost Contributors * Copyright 2023-2024, XGBoost Contributors
*/ */
#pragma once #pragma once
#include <chrono> // for seconds #include <chrono> // for seconds
@ -10,7 +10,6 @@
#include <mutex> // for unique_lock, mutex #include <mutex> // for unique_lock, mutex
#include <queue> // for queue #include <queue> // for queue
#include <thread> // for thread #include <thread> // for thread
#include <utility> // for move
#include "../common/timer.h" // for Monitor #include "../common/timer.h" // for Monitor
#include "xgboost/collective/result.h" // for Result #include "xgboost/collective/result.h" // for Result
@ -37,10 +36,15 @@ class Loop {
}; };
private: private:
std::thread worker_; std::thread worker_; // thread worker to execute the tasks
std::condition_variable cv_;
std::mutex mu_; std::condition_variable cv_; // CV used to notify a new submit call
std::queue<Op> queue_; std::condition_variable block_cv_; // CV used to notify the blocking call
bool block_done_{false}; // Flag to indicate whether the blocking call has finished.
std::queue<Op> queue_; // event queue
std::mutex mu_; // mutex to protect the queue, cv, and block_done
std::chrono::seconds timeout_; std::chrono::seconds timeout_;
Result rc_; Result rc_;
@ -51,29 +55,33 @@ class Loop {
common::Monitor mutable timer_; common::Monitor mutable timer_;
Result EmptyQueue(std::queue<Op>* p_queue) const; Result EmptyQueue(std::queue<Op>* p_queue) const;
// The consumer function that runs inside a worker thread.
void Process(); void Process();
public: public:
/**
* @brief Stop the worker thread.
*/
Result Stop(); Result Stop();
void Submit(Op op) { void Submit(Op op) {
// producer
std::unique_lock lock{mu_}; std::unique_lock lock{mu_};
queue_.push(op); queue_.push(op);
lock.unlock(); lock.unlock();
cv_.notify_one(); cv_.notify_one();
} }
/**
* @brief Block the event loop until all ops are finished. In the case of failure, this
* loop should not be used for new operations.
*/
[[nodiscard]] Result Block(); [[nodiscard]] Result Block();
explicit Loop(std::chrono::seconds timeout); explicit Loop(std::chrono::seconds timeout);
~Loop() noexcept(false) { ~Loop() noexcept(false) {
// The worker will be joined in the stop function.
this->Stop(); this->Stop();
if (worker_.joinable()) {
worker_.join();
}
} }
}; };
} // namespace xgboost::collective } // namespace xgboost::collective