Revamp the rabit implementation. (#10112)

This PR replaces the original RABIT implementation with a new one, which has already been partially merged into XGBoost. The new one features:
- Federated learning for both CPU and GPU.
- NCCL.
- More data types.
- A unified interface for all the underlying implementations.
- Improved timeout handling for both tracker and workers.
- Exhaustive tests with metrics (fixed a couple of bugs along the way).
- A reusable tracker for Python and JVM packages.
This commit is contained in:
Jiaming Yuan
2024-05-20 11:56:23 +08:00
committed by GitHub
parent ba9b4cb1ee
commit a5a58102e5
195 changed files with 2768 additions and 9234 deletions

View File

@@ -55,10 +55,9 @@ struct ResultImpl {
#if (!defined(__GNUC__) && !defined(__clang__)) || defined(__MINGW32__)
#define __builtin_FILE() nullptr
#define __builtin_LINE() (-1)
std::string MakeMsg(std::string&& msg, char const*, std::int32_t);
#else
std::string MakeMsg(std::string&& msg, char const* file, std::int32_t line);
#endif
std::string MakeMsg(std::string&& msg, char const* file, std::int32_t line);
} // namespace detail
/**

View File

@@ -16,6 +16,10 @@
#include <system_error> // std::error_code, std::system_category
#include <utility> // std::swap
#if defined(__linux__)
#include <sys/ioctl.h> // for TIOCOUTQ, FIONREAD
#endif // defined(__linux__)
#if !defined(xgboost_IS_MINGW)
#if defined(__MINGW32__)
@@ -319,7 +323,8 @@ class TCPSocket {
std::int32_t domain;
socklen_t len = sizeof(domain);
xgboost_CHECK_SYS_CALL(
getsockopt(handle_, SOL_SOCKET, SO_DOMAIN, reinterpret_cast<char *>(&domain), &len), 0);
getsockopt(this->Handle(), SOL_SOCKET, SO_DOMAIN, reinterpret_cast<char *>(&domain), &len),
0);
return ret_iafamily(domain);
#else
struct sockaddr sa;
@@ -426,6 +431,35 @@ class TCPSocket {
return Success();
}
/**
 * @brief Query the SO_SNDBUF option of this socket.
 *
 * @param n_bytes Output; receives the option value written by getsockopt.
 *
 * @return Success() when getsockopt returns 0 and reports an option length equal to
 *         sizeof(std::int32_t); otherwise the result of system::FailWithCode.
 */
[[nodiscard]] Result SendBufSize(std::int32_t *n_bytes) {
  // The length argument of getsockopt is value-result: on entry it must hold the size of
  // the output buffer, on return it holds the size of the value actually written.
  socklen_t optlen = sizeof(*n_bytes);
  auto rc = getsockopt(this->Handle(), SOL_SOCKET, SO_SNDBUF, reinterpret_cast<char *>(n_bytes),
                       &optlen);
  if (rc != 0 || optlen != sizeof(std::int32_t)) {
    return system::FailWithCode("getsockopt");
  }
  return Success();
}
/**
 * @brief Query the SO_RCVBUF option of this socket.
 *
 * @param n_bytes Output; receives the option value written by getsockopt.
 *
 * @return Success() when getsockopt returns 0 and reports an option length equal to
 *         sizeof(std::int32_t); otherwise the result of system::FailWithCode.
 */
[[nodiscard]] Result RecvBufSize(std::int32_t *n_bytes) {
  // The length argument of getsockopt is value-result: on entry it must hold the size of
  // the output buffer, on return it holds the size of the value actually written.
  socklen_t optlen = sizeof(*n_bytes);
  auto rc = getsockopt(this->Handle(), SOL_SOCKET, SO_RCVBUF, reinterpret_cast<char *>(n_bytes),
                       &optlen);
  if (rc != 0 || optlen != sizeof(std::int32_t)) {
    return system::FailWithCode("getsockopt");
  }
  return Success();
}
#if defined(__linux__)
/**
 * @brief Issue the TIOCOUTQ ioctl on this socket and store its result in n_bytes.
 *
 * Only compiled on Linux.
 *
 * @param n_bytes Output; written by ioctl.
 *
 * @return Success() when ioctl returns 0, otherwise the result of system::FailWithCode.
 */
[[nodiscard]] Result PendingSendSize(std::int32_t *n_bytes) const {
  auto const status = ioctl(this->Handle(), TIOCOUTQ, n_bytes);
  if (status != 0) {
    return system::FailWithCode("ioctl");
  }
  return Success();
}
/**
 * @brief Issue the FIONREAD ioctl on this socket and store its result in n_bytes.
 *
 * Only compiled on Linux.
 *
 * @param n_bytes Output; written by ioctl.
 *
 * @return Success() when ioctl returns 0, otherwise the result of system::FailWithCode.
 */
[[nodiscard]] Result PendingRecvSize(std::int32_t *n_bytes) const {
  auto const status = ioctl(this->Handle(), FIONREAD, n_bytes);
  if (status != 0) {
    return system::FailWithCode("ioctl");
  }
  return Success();
}
#endif // defined(__linux__)
[[nodiscard]] Result SetKeepAlive() {
std::int32_t keepalive = 1;
auto rc = setsockopt(handle_, SOL_SOCKET, SO_KEEPALIVE, reinterpret_cast<char *>(&keepalive),
@@ -436,10 +470,9 @@ class TCPSocket {
return Success();
}
[[nodiscard]] Result SetNoDelay() {
std::int32_t tcp_no_delay = 1;
auto rc = setsockopt(handle_, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast<char *>(&tcp_no_delay),
sizeof(tcp_no_delay));
[[nodiscard]] Result SetNoDelay(std::int32_t no_delay = 1) {
auto rc = setsockopt(handle_, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast<char *>(&no_delay),
sizeof(no_delay));
if (rc != 0) {
return system::FailWithCode("Failed to set TCP no delay.");
}
@@ -602,45 +635,47 @@ class TCPSocket {
}
/**
* \brief Send data, without error then all data should be sent.
* @brief Send data, without error then all data should be sent.
*/
[[nodiscard]] auto SendAll(void const *buf, std::size_t len) {
[[nodiscard]] Result SendAll(void const *buf, std::size_t len, std::size_t *n_sent) {
char const *_buf = reinterpret_cast<const char *>(buf);
std::size_t ndone = 0;
std::size_t &ndone = *n_sent;
ndone = 0;
while (ndone < len) {
ssize_t ret = send(handle_, _buf, len - ndone, 0);
if (ret == -1) {
if (system::LastErrorWouldBlock()) {
return ndone;
return Success();
}
system::ThrowAtError("send");
return system::FailWithCode("send");
}
_buf += ret;
ndone += ret;
}
return ndone;
return Success();
}
/**
* \brief Receive data, without error then all data should be received.
* @brief Receive data, without error then all data should be received.
*/
[[nodiscard]] auto RecvAll(void *buf, std::size_t len) {
[[nodiscard]] Result RecvAll(void *buf, std::size_t len, std::size_t *n_recv) {
char *_buf = reinterpret_cast<char *>(buf);
std::size_t ndone = 0;
std::size_t &ndone = *n_recv;
ndone = 0;
while (ndone < len) {
ssize_t ret = recv(handle_, _buf, len - ndone, MSG_WAITALL);
if (ret == -1) {
if (system::LastErrorWouldBlock()) {
return ndone;
return Success();
}
system::ThrowAtError("recv");
return system::FailWithCode("recv");
}
if (ret == 0) {
return ndone;
return Success();
}
_buf += ret;
ndone += ret;
}
return ndone;
return Success();
}
/**
* \brief Send data using the socket