bug fixed ver
This commit is contained in:
parent
a186f8c3aa
commit
8a6768763d
@ -55,7 +55,8 @@ void AllreduceBase::Shutdown(void) {
|
|||||||
|
|
||||||
utils::Assert(master.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
|
utils::Assert(master.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
|
||||||
master.SendStr(job_id);
|
master.SendStr(job_id);
|
||||||
master.SendStr(std::string("shutdown"));
|
master.SendStr(std::string("shutdown"));
|
||||||
|
master.Close();
|
||||||
utils::TCPSocket::Finalize();
|
utils::TCPSocket::Finalize();
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
@ -102,7 +103,6 @@ void AllreduceBase::ReConnectLinks(void) {
|
|||||||
utils::Assert(master.SendAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 1");
|
utils::Assert(master.SendAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 1");
|
||||||
utils::Assert(master.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 2");
|
utils::Assert(master.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 2");
|
||||||
utils::Check(magic == kMagic, "sync::Invalid master message, init failure");
|
utils::Check(magic == kMagic, "sync::Invalid master message, init failure");
|
||||||
|
|
||||||
utils::Assert(master.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
|
utils::Assert(master.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
|
||||||
master.SendStr(job_id);
|
master.SendStr(job_id);
|
||||||
master.SendStr(std::string("start"));
|
master.SendStr(std::string("start"));
|
||||||
@ -112,10 +112,11 @@ void AllreduceBase::ReConnectLinks(void) {
|
|||||||
"ReConnectLink failure 4");
|
"ReConnectLink failure 4");
|
||||||
utils::Assert(master.RecvAll(&parent_rank, sizeof(parent_rank)) == sizeof(parent_rank),
|
utils::Assert(master.RecvAll(&parent_rank, sizeof(parent_rank)) == sizeof(parent_rank),
|
||||||
"ReConnectLink failure 4");
|
"ReConnectLink failure 4");
|
||||||
|
utils::Assert(master.RecvAll(&world_size, sizeof(world_size)) == sizeof(world_size),
|
||||||
|
"ReConnectLink failure 4");
|
||||||
utils::Assert(rank == -1 || newrank == rank, "must keep rank to same if the node already have one");
|
utils::Assert(rank == -1 || newrank == rank, "must keep rank to same if the node already have one");
|
||||||
rank = newrank;
|
rank = newrank;
|
||||||
}
|
}
|
||||||
|
|
||||||
// create listening socket
|
// create listening socket
|
||||||
utils::TCPSocket sock_listen;
|
utils::TCPSocket sock_listen;
|
||||||
sock_listen.Create();
|
sock_listen.Create();
|
||||||
@ -125,7 +126,6 @@ void AllreduceBase::ReConnectLinks(void) {
|
|||||||
|
|
||||||
// get number of to connect and number of to accept nodes from master
|
// get number of to connect and number of to accept nodes from master
|
||||||
int num_conn, num_accept, num_error = 1;
|
int num_conn, num_accept, num_error = 1;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
// send over good links
|
// send over good links
|
||||||
std::vector<int> good_link;
|
std::vector<int> good_link;
|
||||||
@ -146,7 +146,7 @@ void AllreduceBase::ReConnectLinks(void) {
|
|||||||
utils::Assert(master.RecvAll(&num_conn, sizeof(num_conn)) == sizeof(num_conn),
|
utils::Assert(master.RecvAll(&num_conn, sizeof(num_conn)) == sizeof(num_conn),
|
||||||
"ReConnectLink failure 7");
|
"ReConnectLink failure 7");
|
||||||
utils::Assert(master.RecvAll(&num_accept, sizeof(num_accept)) == sizeof(num_accept),
|
utils::Assert(master.RecvAll(&num_accept, sizeof(num_accept)) == sizeof(num_accept),
|
||||||
"ReConnectLink failure 8");
|
"ReConnectLink failure 8");
|
||||||
num_error = 0;
|
num_error = 0;
|
||||||
for (int i = 0; i < num_conn; ++i) {
|
for (int i = 0; i < num_conn; ++i) {
|
||||||
LinkRecord r;
|
LinkRecord r;
|
||||||
@ -202,7 +202,6 @@ void AllreduceBase::ReConnectLinks(void) {
|
|||||||
links[i].sock.SetNonBlock(true);
|
links[i].sock.SetNonBlock(true);
|
||||||
if (links[i].rank == parent_rank) parent_index = static_cast<int>(i);
|
if (links[i].rank == parent_rank) parent_index = static_cast<int>(i);
|
||||||
}
|
}
|
||||||
utils::LogPrintf("[%d] parent_rank=%d, parent_index=%d, nlink=%d\n", rank, parent_rank, parent_index, (int)links.size());
|
|
||||||
if (parent_rank != -1) {
|
if (parent_rank != -1) {
|
||||||
utils::Assert(parent_index != -1, "cannot find parent in the link");
|
utils::Assert(parent_index != -1, "cannot find parent in the link");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -80,7 +80,7 @@ class AllreduceBase : public IEngine {
|
|||||||
*/
|
*/
|
||||||
virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root) {
|
virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root) {
|
||||||
utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess,
|
utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess,
|
||||||
"Allreduce failed");
|
"Broadcast failed");
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief load latest check point
|
* \brief load latest check point
|
||||||
|
|||||||
@ -103,7 +103,7 @@ inline void Broadcast(std::vector<DType> *sendrecv_data, int root) {
|
|||||||
sendrecv_data->resize(size);
|
sendrecv_data->resize(size);
|
||||||
}
|
}
|
||||||
if (size != 0) {
|
if (size != 0) {
|
||||||
Broadcast(&sendrecv_data[0], size * sizeof(DType), root);
|
Broadcast(&(*sendrecv_data)[0], size * sizeof(DType), root);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
inline void Broadcast(std::string *sendrecv_data, int root) {
|
inline void Broadcast(std::string *sendrecv_data, int root) {
|
||||||
@ -113,7 +113,7 @@ inline void Broadcast(std::string *sendrecv_data, int root) {
|
|||||||
sendrecv_data->resize(size);
|
sendrecv_data->resize(size);
|
||||||
}
|
}
|
||||||
if (size != 0) {
|
if (size != 0) {
|
||||||
Broadcast(&sendrecv_data[0], size * sizeof(char), root);
|
Broadcast(&(*sendrecv_data)[0], size * sizeof(char), root);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -77,6 +77,8 @@ class SlaveEntry:
|
|||||||
self.sock.sendint(rank)
|
self.sock.sendint(rank)
|
||||||
# send parent rank
|
# send parent rank
|
||||||
self.sock.sendint((rank + 1) / 2 - 1)
|
self.sock.sendint((rank + 1) / 2 - 1)
|
||||||
|
# send world size
|
||||||
|
self.sock.sendint(nslave)
|
||||||
while True:
|
while True:
|
||||||
ngood = self.sock.recvint()
|
ngood = self.sock.recvint()
|
||||||
goodset = set([])
|
goodset = set([])
|
||||||
@ -88,8 +90,6 @@ class SlaveEntry:
|
|||||||
for r in badset:
|
for r in badset:
|
||||||
if r in wait_conn:
|
if r in wait_conn:
|
||||||
conset.append(r)
|
conset.append(r)
|
||||||
print 'rank=%d' % rank
|
|
||||||
print 'conset=%s' % str(conset)
|
|
||||||
self.sock.sendint(len(conset))
|
self.sock.sendint(len(conset))
|
||||||
self.sock.sendint(len(badset) - len(conset))
|
self.sock.sendint(len(badset) - len(conset))
|
||||||
for r in conset:
|
for r in conset:
|
||||||
@ -109,7 +109,6 @@ class SlaveEntry:
|
|||||||
for r in rmset:
|
for r in rmset:
|
||||||
wait_conn.pop(r, None)
|
wait_conn.pop(r, None)
|
||||||
self.wait_accept = len(badset) - len(conset)
|
self.wait_accept = len(badset) - len(conset)
|
||||||
print 'wait=%d' % self.wait_accept
|
|
||||||
return rmset
|
return rmset
|
||||||
|
|
||||||
class Master:
|
class Master:
|
||||||
|
|||||||
@ -80,7 +80,7 @@ int main(int argc, char *argv[]) {
|
|||||||
TestSum(mock, n);
|
TestSum(mock, n);
|
||||||
utils::LogPrintf("[%d] !!!TestSum pass\n", rank);
|
utils::LogPrintf("[%d] !!!TestSum pass\n", rank);
|
||||||
int step = std::max(nproc / 3, 1);
|
int step = std::max(nproc / 3, 1);
|
||||||
for (int i = 0; i < nproc; i += step) {
|
for (int i = 0; i < nproc; i += step) {
|
||||||
TestBcast(mock, n, i);
|
TestBcast(mock, n, i);
|
||||||
}
|
}
|
||||||
utils::LogPrintf("[%d] !!!TestBcast pass\n", rank);
|
utils::LogPrintf("[%d] !!!TestBcast pass\n", rank);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user