Revamp the rabit implementation. (#10112)
This PR replaces the original RABIT implementation with a new one, which has already been partially merged into XGBoost. The new one features: - Federated learning for both CPU and GPU. - NCCL. - More data types. - A unified interface for all the underlying implementations. - Improved timeout handling for both tracker and workers. - Exhausted tests with metrics (fixed a couple of bugs along the way). - A reusable tracker for Python and JVM packages.
This commit is contained in:
@@ -55,10 +55,9 @@ struct ResultImpl {
|
||||
#if (!defined(__GNUC__) && !defined(__clang__)) || defined(__MINGW32__)
|
||||
#define __builtin_FILE() nullptr
|
||||
#define __builtin_LINE() (-1)
|
||||
std::string MakeMsg(std::string&& msg, char const*, std::int32_t);
|
||||
#else
|
||||
std::string MakeMsg(std::string&& msg, char const* file, std::int32_t line);
|
||||
#endif
|
||||
|
||||
std::string MakeMsg(std::string&& msg, char const* file, std::int32_t line);
|
||||
} // namespace detail
|
||||
|
||||
/**
|
||||
|
||||
@@ -16,6 +16,10 @@
|
||||
#include <system_error> // std::error_code, std::system_category
|
||||
#include <utility> // std::swap
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <sys/ioctl.h> // for TIOCOUTQ, FIONREAD
|
||||
#endif // defined(__linux__)
|
||||
|
||||
#if !defined(xgboost_IS_MINGW)
|
||||
|
||||
#if defined(__MINGW32__)
|
||||
@@ -319,7 +323,8 @@ class TCPSocket {
|
||||
std::int32_t domain;
|
||||
socklen_t len = sizeof(domain);
|
||||
xgboost_CHECK_SYS_CALL(
|
||||
getsockopt(handle_, SOL_SOCKET, SO_DOMAIN, reinterpret_cast<char *>(&domain), &len), 0);
|
||||
getsockopt(this->Handle(), SOL_SOCKET, SO_DOMAIN, reinterpret_cast<char *>(&domain), &len),
|
||||
0);
|
||||
return ret_iafamily(domain);
|
||||
#else
|
||||
struct sockaddr sa;
|
||||
@@ -426,6 +431,35 @@ class TCPSocket {
|
||||
return Success();
|
||||
}
|
||||
|
||||
[[nodiscard]] Result SendBufSize(std::int32_t *n_bytes) {
|
||||
socklen_t optlen;
|
||||
auto rc = getsockopt(this->Handle(), SOL_SOCKET, SO_SNDBUF, reinterpret_cast<char *>(n_bytes),
|
||||
&optlen);
|
||||
if (rc != 0 || optlen != sizeof(std::int32_t)) {
|
||||
return system::FailWithCode("getsockopt");
|
||||
}
|
||||
return Success();
|
||||
}
|
||||
[[nodiscard]] Result RecvBufSize(std::int32_t *n_bytes) {
|
||||
socklen_t optlen;
|
||||
auto rc = getsockopt(this->Handle(), SOL_SOCKET, SO_RCVBUF, reinterpret_cast<char *>(n_bytes),
|
||||
&optlen);
|
||||
if (rc != 0 || optlen != sizeof(std::int32_t)) {
|
||||
return system::FailWithCode("getsockopt");
|
||||
}
|
||||
return Success();
|
||||
}
|
||||
#if defined(__linux__)
|
||||
[[nodiscard]] Result PendingSendSize(std::int32_t *n_bytes) const {
|
||||
return ioctl(this->Handle(), TIOCOUTQ, n_bytes) == 0 ? Success()
|
||||
: system::FailWithCode("ioctl");
|
||||
}
|
||||
[[nodiscard]] Result PendingRecvSize(std::int32_t *n_bytes) const {
|
||||
return ioctl(this->Handle(), FIONREAD, n_bytes) == 0 ? Success()
|
||||
: system::FailWithCode("ioctl");
|
||||
}
|
||||
#endif // defined(__linux__)
|
||||
|
||||
[[nodiscard]] Result SetKeepAlive() {
|
||||
std::int32_t keepalive = 1;
|
||||
auto rc = setsockopt(handle_, SOL_SOCKET, SO_KEEPALIVE, reinterpret_cast<char *>(&keepalive),
|
||||
@@ -436,10 +470,9 @@ class TCPSocket {
|
||||
return Success();
|
||||
}
|
||||
|
||||
[[nodiscard]] Result SetNoDelay() {
|
||||
std::int32_t tcp_no_delay = 1;
|
||||
auto rc = setsockopt(handle_, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast<char *>(&tcp_no_delay),
|
||||
sizeof(tcp_no_delay));
|
||||
[[nodiscard]] Result SetNoDelay(std::int32_t no_delay = 1) {
|
||||
auto rc = setsockopt(handle_, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast<char *>(&no_delay),
|
||||
sizeof(no_delay));
|
||||
if (rc != 0) {
|
||||
return system::FailWithCode("Failed to set TCP no delay.");
|
||||
}
|
||||
@@ -602,45 +635,47 @@ class TCPSocket {
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Send data, without error then all data should be sent.
|
||||
* @brief Send data, without error then all data should be sent.
|
||||
*/
|
||||
[[nodiscard]] auto SendAll(void const *buf, std::size_t len) {
|
||||
[[nodiscard]] Result SendAll(void const *buf, std::size_t len, std::size_t *n_sent) {
|
||||
char const *_buf = reinterpret_cast<const char *>(buf);
|
||||
std::size_t ndone = 0;
|
||||
std::size_t &ndone = *n_sent;
|
||||
ndone = 0;
|
||||
while (ndone < len) {
|
||||
ssize_t ret = send(handle_, _buf, len - ndone, 0);
|
||||
if (ret == -1) {
|
||||
if (system::LastErrorWouldBlock()) {
|
||||
return ndone;
|
||||
return Success();
|
||||
}
|
||||
system::ThrowAtError("send");
|
||||
return system::FailWithCode("send");
|
||||
}
|
||||
_buf += ret;
|
||||
ndone += ret;
|
||||
}
|
||||
return ndone;
|
||||
return Success();
|
||||
}
|
||||
/**
|
||||
* \brief Receive data, without error then all data should be received.
|
||||
* @brief Receive data, without error then all data should be received.
|
||||
*/
|
||||
[[nodiscard]] auto RecvAll(void *buf, std::size_t len) {
|
||||
[[nodiscard]] Result RecvAll(void *buf, std::size_t len, std::size_t *n_recv) {
|
||||
char *_buf = reinterpret_cast<char *>(buf);
|
||||
std::size_t ndone = 0;
|
||||
std::size_t &ndone = *n_recv;
|
||||
ndone = 0;
|
||||
while (ndone < len) {
|
||||
ssize_t ret = recv(handle_, _buf, len - ndone, MSG_WAITALL);
|
||||
if (ret == -1) {
|
||||
if (system::LastErrorWouldBlock()) {
|
||||
return ndone;
|
||||
return Success();
|
||||
}
|
||||
system::ThrowAtError("recv");
|
||||
return system::FailWithCode("recv");
|
||||
}
|
||||
if (ret == 0) {
|
||||
return ndone;
|
||||
return Success();
|
||||
}
|
||||
_buf += ret;
|
||||
ndone += ret;
|
||||
}
|
||||
return ndone;
|
||||
return Success();
|
||||
}
|
||||
/**
|
||||
* \brief Send data using the socket
|
||||
|
||||
Reference in New Issue
Block a user