add recover data, do a round of review
This commit is contained in:
parent
b9b58a1275
commit
ecb09a23bc
@ -5,7 +5,7 @@
|
|||||||
* \brief This file defines a template wrapper of engine to give more flexible
|
* \brief This file defines a template wrapper of engine to give more flexible
|
||||||
* AllReduce operations
|
* AllReduce operations
|
||||||
*
|
*
|
||||||
* \author Tianqi Chen, Nacho, Tianyi
|
* \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
|
||||||
*/
|
*/
|
||||||
#include "./engine.h"
|
#include "./engine.h"
|
||||||
|
|
||||||
|
|||||||
@ -3,7 +3,7 @@
|
|||||||
* \brief this file governs which implementation of engine we are actually using
|
* \brief this file governs which implementation of engine we are actually using
|
||||||
* provides an singleton of engine interface
|
* provides an singleton of engine interface
|
||||||
*
|
*
|
||||||
* \author Tianqi, Nacho, Tianyi
|
* \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
|
||||||
*/
|
*/
|
||||||
#define _CRT_SECURE_NO_WARNINGS
|
#define _CRT_SECURE_NO_WARNINGS
|
||||||
#define _CRT_SECURE_NO_DEPRECATE
|
#define _CRT_SECURE_NO_DEPRECATE
|
||||||
|
|||||||
@ -1,7 +1,8 @@
|
|||||||
/*!
|
/*!
|
||||||
* \file engine_base.cc
|
* \file engine_base.cc
|
||||||
* \brief Basic implementation of AllReduce
|
* \brief Basic implementation of AllReduce
|
||||||
* \author Tianqi, Nacho, Tianyi
|
*
|
||||||
|
* \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
|
||||||
*/
|
*/
|
||||||
#define _CRT_SECURE_NO_WARNINGS
|
#define _CRT_SECURE_NO_WARNINGS
|
||||||
#define _CRT_SECURE_NO_DEPRECATE
|
#define _CRT_SECURE_NO_DEPRECATE
|
||||||
|
|||||||
@ -6,7 +6,7 @@
|
|||||||
* This implementation provides basic utility of AllReduce and Broadcast
|
* This implementation provides basic utility of AllReduce and Broadcast
|
||||||
* without considering node failure
|
* without considering node failure
|
||||||
*
|
*
|
||||||
* \author Tianqi, Nacho, Tianyi
|
* \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
|
||||||
*/
|
*/
|
||||||
#ifndef ALLREDUCE_ENGINE_BASE_H
|
#ifndef ALLREDUCE_ENGINE_BASE_H
|
||||||
#define ALLREDUCE_ENGINE_BASE_H
|
#define ALLREDUCE_ENGINE_BASE_H
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
* \file engine_robust-inl.h
|
* \file engine_robust-inl.h
|
||||||
* \brief implementation of inline template function in AllReduceRobust
|
* \brief implementation of inline template function in AllReduceRobust
|
||||||
*
|
*
|
||||||
* \author Tianqi, Nacho, Tianyi
|
* \author Tianqi Chen
|
||||||
*/
|
*/
|
||||||
#ifndef ALLREDUCE_ENGINE_ROBUST_INL_H
|
#ifndef ALLREDUCE_ENGINE_ROBUST_INL_H
|
||||||
#define ALLREDUCE_ENGINE_ROBUST_INL_H
|
#define ALLREDUCE_ENGINE_ROBUST_INL_H
|
||||||
|
|||||||
@ -1,7 +1,8 @@
|
|||||||
/*!
|
/*!
|
||||||
* \file engine_robust.cc
|
* \file engine_robust.cc
|
||||||
* \brief Robust implementation of AllReduce
|
* \brief Robust implementation of AllReduce
|
||||||
* \author Tianqi, Nacho, Tianyi
|
*
|
||||||
|
* \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
|
||||||
*/
|
*/
|
||||||
#define _CRT_SECURE_NO_WARNINGS
|
#define _CRT_SECURE_NO_WARNINGS
|
||||||
#define _CRT_SECURE_NO_DEPRECATE
|
#define _CRT_SECURE_NO_DEPRECATE
|
||||||
@ -272,24 +273,22 @@ inline char DataRequest(const std::pair<bool, int> &node_value,
|
|||||||
/*!
|
/*!
|
||||||
* \brief try to decide the recovery message passing request
|
* \brief try to decide the recovery message passing request
|
||||||
* \param role the current role of the node
|
* \param role the current role of the node
|
||||||
* \param p_req_outlink used to store the output link the
|
|
||||||
* current node should recv data from,
|
|
||||||
* this can be -1 or -2,
|
|
||||||
* -1 means current node have the data
|
|
||||||
* -2 means current node do not have data, but also do not need to send/recv data
|
|
||||||
* \param p_req_in used to store the resulting vector, indicating which link we should send the data to
|
|
||||||
* \param p_size used to store the size of the message, for node in state kHaveData,
|
* \param p_size used to store the size of the message, for node in state kHaveData,
|
||||||
* this size must be set correctly before calling the function
|
* this size must be set correctly before calling the function
|
||||||
* for others, this surves as output parameter
|
* for others, this surves as output parameter
|
||||||
*
|
*
|
||||||
|
* \param p_recvlink used to store the link current node should recv data from, if necessary
|
||||||
|
* this can be -1, which means current node have the data
|
||||||
|
* \param p_req_in used to store the resulting vector, indicating which link we should send the data to
|
||||||
|
*
|
||||||
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
|
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
|
||||||
* \sa ReturnType
|
* \sa ReturnType
|
||||||
*/
|
*/
|
||||||
AllReduceRobust::ReturnType
|
AllReduceRobust::ReturnType
|
||||||
AllReduceRobust::TryDecideRequest(AllReduceRobust::RecoverType role,
|
AllReduceRobust::TryDecideRouting(AllReduceRobust::RecoverType role,
|
||||||
int *p_req_outlink,
|
size_t *p_size,
|
||||||
std::vector<bool> *p_req_in,
|
int *p_recvlink,
|
||||||
size_t *p_size) {
|
std::vector<bool> *p_req_in) {
|
||||||
int best_link = -2;
|
int best_link = -2;
|
||||||
{// get the shortest distance to the request point
|
{// get the shortest distance to the request point
|
||||||
std::vector< std::pair<int,size_t> > dist_in, dist_out;
|
std::vector< std::pair<int,size_t> > dist_in, dist_out;
|
||||||
@ -317,7 +316,6 @@ AllReduceRobust::TryDecideRequest(AllReduceRobust::RecoverType role,
|
|||||||
ReturnType succ = MsgPassing(std::make_pair(role == kRequestData, best_link),
|
ReturnType succ = MsgPassing(std::make_pair(role == kRequestData, best_link),
|
||||||
&req_in, &req_out, DataRequest);
|
&req_in, &req_out, DataRequest);
|
||||||
if (succ != kSuccess) return succ;
|
if (succ != kSuccess) return succ;
|
||||||
bool need_recv = false;
|
|
||||||
// set p_req_in
|
// set p_req_in
|
||||||
p_req_in->resize(req_in.size());
|
p_req_in->resize(req_in.size());
|
||||||
for (size_t i = 0; i < req_in.size(); ++i) {
|
for (size_t i = 0; i < req_in.size(); ++i) {
|
||||||
@ -326,16 +324,115 @@ AllReduceRobust::TryDecideRequest(AllReduceRobust::RecoverType role,
|
|||||||
if (req_out[i] != 0) {
|
if (req_out[i] != 0) {
|
||||||
utils::Assert(req_in[i] == 0, "cannot get and receive request");
|
utils::Assert(req_in[i] == 0, "cannot get and receive request");
|
||||||
utils::Assert(static_cast<int>(i) == best_link, "request result inconsistent");
|
utils::Assert(static_cast<int>(i) == best_link, "request result inconsistent");
|
||||||
need_recv = true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (role == kPassData && !need_recv) {
|
*p_recvlink = best_link;
|
||||||
for (size_t i = 0; i < req_in.size(); ++i) {
|
return kSuccess;
|
||||||
utils::Assert(req_in[i] == 0, "Bug in TryDecideRequest");
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief try to finish the data recovery request,
|
||||||
|
* this function is used together with TryDecideRouting
|
||||||
|
* \param role the current role of the node
|
||||||
|
* \param sendrecvbuf_ the buffer to store the data to be sent/recived
|
||||||
|
* - if the role is kHaveData, this stores the data to be sent
|
||||||
|
* - if the role is kRequestData, this is the buffer to store the result
|
||||||
|
* - if the role is kPassData, this will not be used, and can be NULL
|
||||||
|
* \param size the size of the data, obtained from TryDecideRouting
|
||||||
|
* \param recv_link the link index to receive data, if necessary, obtained from TryDecideRouting
|
||||||
|
* \param req_in the request of each link to send data, obtained from TryDecideRouting
|
||||||
|
*
|
||||||
|
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
|
||||||
|
* \sa ReturnType, TryDecideRouting
|
||||||
|
*/
|
||||||
|
AllReduceRobust::ReturnType
|
||||||
|
AllReduceRobust::TryRecoverData(RecoverType role,
|
||||||
|
void *sendrecvbuf_,
|
||||||
|
size_t size,
|
||||||
|
int recv_link,
|
||||||
|
const std::vector<bool> &req_in) {
|
||||||
|
// no need to run recovery for zero size message
|
||||||
|
if (size == 0) return kSuccess;
|
||||||
|
utils::Assert(req_in.size() == links.size(), "TryRecoverData");
|
||||||
|
const int nlink = static_cast<int>(links.size());
|
||||||
|
{
|
||||||
|
bool req_data = role == kRequestData;
|
||||||
|
for (int i = 0; i < nlink; ++i) {
|
||||||
|
if (req_in[i]) {
|
||||||
|
utils::Assert(i != recv_link, "TryDecideRouting");
|
||||||
|
req_data = true;
|
||||||
}
|
}
|
||||||
*p_req_outlink = -2;
|
}
|
||||||
|
// do not need to provide data or receive data, directly exit
|
||||||
|
if (!req_data) return kSuccess;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < nlink; ++i) {
|
||||||
|
links[i].ResetSize();
|
||||||
|
}
|
||||||
|
utils::Assert(recv_link >= 0 || role == kHaveData, "recv_link must be active");
|
||||||
|
if (role == kPassData) {
|
||||||
|
links[recv_link].InitBuffer(1, size, reduce_buffer_size);
|
||||||
|
}
|
||||||
|
while (true) {
|
||||||
|
bool finished = true;
|
||||||
|
utils::SelectHelper selecter;
|
||||||
|
for (int i = 0; i < nlink; ++i) {
|
||||||
|
if (i == recv_link && links[i].size_read != size) {
|
||||||
|
selecter.WatchRead(links[i].sock);
|
||||||
|
finished = false;
|
||||||
|
}
|
||||||
|
if (req_in[i] && links[i].size_write != size) {
|
||||||
|
selecter.WatchWrite(links[i].sock);
|
||||||
|
finished = false;
|
||||||
|
}
|
||||||
|
selecter.WatchException(links[i].sock);
|
||||||
|
}
|
||||||
|
if (finished) break;
|
||||||
|
selecter.Select();
|
||||||
|
if (role == kRequestData) {
|
||||||
|
const int pid = recv_link;
|
||||||
|
if (selecter.CheckRead(links[pid].sock)) {
|
||||||
|
if(!links[pid].ReadToArray(sendrecvbuf_, size)) return kSockError;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < nlink; ++i) {
|
||||||
|
if (req_in[i] && links[i].size_write != links[pid].size_read &&
|
||||||
|
selecter.CheckWrite(links[i].sock)) {
|
||||||
|
if(!links[i].WriteFromArray(sendrecvbuf_, links[pid].size_read)) return kSockError;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (role == kHaveData) {
|
||||||
|
for (int i = 0; i < nlink; ++i) {
|
||||||
|
if (req_in[i] && selecter.CheckWrite(links[i].sock)) {
|
||||||
|
if(!links[i].WriteFromArray(sendrecvbuf_, size)) return kSockError;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (role == kPassData) {
|
||||||
|
const int pid = recv_link;
|
||||||
|
const size_t buffer_size = links[pid].buffer_size;
|
||||||
|
if (selecter.CheckRead(links[pid].sock)) {
|
||||||
|
size_t min_write = size;
|
||||||
|
for (int i = 0; i < nlink; ++i) {
|
||||||
|
if (req_in[i]) min_write = std::min(links[i].size_write, min_write);
|
||||||
|
}
|
||||||
|
utils::Assert(min_write <= links[pid].size_read, "boundary check");
|
||||||
|
if (!links[pid].ReadToRingBuffer(min_write)) return kSockError;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < nlink; ++i) {
|
||||||
|
if (req_in[i] && selecter.CheckWrite(links[i].sock)) {
|
||||||
|
size_t start = links[i].size_write % buffer_size;
|
||||||
|
// send out data from ring buffer
|
||||||
|
size_t nwrite = std::min(buffer_size - start, links[pid].size_read - links[i].size_write);
|
||||||
|
ssize_t len = links[pid].sock.Send(links[pid].buffer_head + start, nwrite);
|
||||||
|
if (len != -1) {
|
||||||
|
links[i].size_write += len;
|
||||||
} else {
|
} else {
|
||||||
*p_req_outlink = best_link;
|
if (errno != EAGAIN && errno != EWOULDBLOCK) return kSockError;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return kSuccess;
|
return kSuccess;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -5,7 +5,7 @@
|
|||||||
*
|
*
|
||||||
* This implementation considers the failure of nodes
|
* This implementation considers the failure of nodes
|
||||||
*
|
*
|
||||||
* \author Tianqi, Nacho, Tianyi
|
* \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
|
||||||
*/
|
*/
|
||||||
#ifndef ALLREDUCE_ENGINE_ROBUST_H
|
#ifndef ALLREDUCE_ENGINE_ROBUST_H
|
||||||
#define ALLREDUCE_ENGINE_ROBUST_H
|
#define ALLREDUCE_ENGINE_ROBUST_H
|
||||||
@ -257,25 +257,43 @@ class AllReduceRobust : public AllReduceBase {
|
|||||||
*/
|
*/
|
||||||
ReturnType TryGetResult(void *buf, size_t size, int seqno, bool requester);
|
ReturnType TryGetResult(void *buf, size_t size, int seqno, bool requester);
|
||||||
/*!
|
/*!
|
||||||
* \brief try to decide the recovery message passing request
|
* \brief try to decide the routing strategy for recovery
|
||||||
* \param role the current role of the node
|
* \param role the current role of the node
|
||||||
* \param p_req_outlink used to store the output link the
|
|
||||||
* current node should recv data from,
|
|
||||||
* this can be nonnegative value, -1 or -2,
|
|
||||||
* -1 means current node have the data
|
|
||||||
* -2 means current node do not have data, but also do not need to send/recv data
|
|
||||||
* \param p_req_in used to store the resulting vector, indicating which link we should send the data to
|
|
||||||
* \param p_size used to store the size of the message, for node in state kHaveData,
|
* \param p_size used to store the size of the message, for node in state kHaveData,
|
||||||
* this size must be set correctly before calling the function
|
* this size must be set correctly before calling the function
|
||||||
* for others, this surves as output parameter
|
* for others, this surves as output parameter
|
||||||
|
|
||||||
|
* \param p_recvlink used to store the link current node should recv data from, if necessary
|
||||||
|
* this can be -1, which means current node have the data
|
||||||
|
* \param p_req_in used to store the resulting vector, indicating which link we should send the data to
|
||||||
*
|
*
|
||||||
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
|
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
|
||||||
* \sa ReturnType
|
* \sa ReturnType, TryRecoverData
|
||||||
*/
|
*/
|
||||||
ReturnType TryDecideRequest(RecoverType role,
|
ReturnType TryDecideRouting(RecoverType role,
|
||||||
int *p_req_outlink,
|
size_t *p_size,
|
||||||
std::vector<bool> *p_req_in,
|
int *p_recvlink,
|
||||||
size_t *p_size);
|
std::vector<bool> *p_req_in);
|
||||||
|
/*!
|
||||||
|
* \brief try to finish the data recovery request,
|
||||||
|
* this function is used together with TryDecideRouting
|
||||||
|
* \param role the current role of the node
|
||||||
|
* \param sendrecvbuf_ the buffer to store the data to be sent/recived
|
||||||
|
* - if the role is kHaveData, this stores the data to be sent
|
||||||
|
* - if the role is kRequestData, this is the buffer to store the result
|
||||||
|
* - if the role is kPassData, this will not be used, and can be NULL
|
||||||
|
* \param size the size of the data, obtained from TryDecideRouting
|
||||||
|
* \param recv_link the link index to receive data, if necessary, obtained from TryDecideRouting
|
||||||
|
* \param req_in the request of each link to send data, obtained from TryDecideRouting
|
||||||
|
*
|
||||||
|
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
|
||||||
|
* \sa ReturnType, TryDecideRouting
|
||||||
|
*/
|
||||||
|
ReturnType TryRecoverData(RecoverType role,
|
||||||
|
void *sendrecvbuf_,
|
||||||
|
size_t size,
|
||||||
|
int recv_link,
|
||||||
|
const std::vector<bool> &req_in);
|
||||||
/*!
|
/*!
|
||||||
* \brief run message passing algorithm on the allreduce tree
|
* \brief run message passing algorithm on the allreduce tree
|
||||||
* the result is edge message stored in p_edge_in and p_edge_out
|
* the result is edge message stored in p_edge_in and p_edge_out
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user