bug fixed ver
This commit is contained in:
parent
a186f8c3aa
commit
8a6768763d
@ -56,6 +56,7 @@ void AllreduceBase::Shutdown(void) {
|
||||
utils::Assert(master.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
|
||||
master.SendStr(job_id);
|
||||
master.SendStr(std::string("shutdown"));
|
||||
master.Close();
|
||||
utils::TCPSocket::Finalize();
|
||||
}
|
||||
/*!
|
||||
@ -102,7 +103,6 @@ void AllreduceBase::ReConnectLinks(void) {
|
||||
utils::Assert(master.SendAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 1");
|
||||
utils::Assert(master.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 2");
|
||||
utils::Check(magic == kMagic, "sync::Invalid master message, init failure");
|
||||
|
||||
utils::Assert(master.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
|
||||
master.SendStr(job_id);
|
||||
master.SendStr(std::string("start"));
|
||||
@ -112,10 +112,11 @@ void AllreduceBase::ReConnectLinks(void) {
|
||||
"ReConnectLink failure 4");
|
||||
utils::Assert(master.RecvAll(&parent_rank, sizeof(parent_rank)) == sizeof(parent_rank),
|
||||
"ReConnectLink failure 4");
|
||||
utils::Assert(master.RecvAll(&world_size, sizeof(world_size)) == sizeof(world_size),
|
||||
"ReConnectLink failure 4");
|
||||
utils::Assert(rank == -1 || newrank == rank, "must keep rank to same if the node already have one");
|
||||
rank = newrank;
|
||||
}
|
||||
|
||||
// create listening socket
|
||||
utils::TCPSocket sock_listen;
|
||||
sock_listen.Create();
|
||||
@ -125,7 +126,6 @@ void AllreduceBase::ReConnectLinks(void) {
|
||||
|
||||
// get number of to connect and number of to accept nodes from master
|
||||
int num_conn, num_accept, num_error = 1;
|
||||
|
||||
do {
|
||||
// send over good links
|
||||
std::vector<int> good_link;
|
||||
@ -202,7 +202,6 @@ void AllreduceBase::ReConnectLinks(void) {
|
||||
links[i].sock.SetNonBlock(true);
|
||||
if (links[i].rank == parent_rank) parent_index = static_cast<int>(i);
|
||||
}
|
||||
utils::LogPrintf("[%d] parent_rank=%d, parent_index=%d, nlink=%d\n", rank, parent_rank, parent_index, (int)links.size());
|
||||
if (parent_rank != -1) {
|
||||
utils::Assert(parent_index != -1, "cannot find parent in the link");
|
||||
}
|
||||
|
||||
@ -80,7 +80,7 @@ class AllreduceBase : public IEngine {
|
||||
*/
|
||||
virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root) {
|
||||
utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess,
|
||||
"Allreduce failed");
|
||||
"Broadcast failed");
|
||||
}
|
||||
/*!
|
||||
* \brief load latest check point
|
||||
|
||||
@ -103,7 +103,7 @@ inline void Broadcast(std::vector<DType> *sendrecv_data, int root) {
|
||||
sendrecv_data->resize(size);
|
||||
}
|
||||
if (size != 0) {
|
||||
Broadcast(&sendrecv_data[0], size * sizeof(DType), root);
|
||||
Broadcast(&(*sendrecv_data)[0], size * sizeof(DType), root);
|
||||
}
|
||||
}
|
||||
inline void Broadcast(std::string *sendrecv_data, int root) {
|
||||
@ -113,7 +113,7 @@ inline void Broadcast(std::string *sendrecv_data, int root) {
|
||||
sendrecv_data->resize(size);
|
||||
}
|
||||
if (size != 0) {
|
||||
Broadcast(&sendrecv_data[0], size * sizeof(char), root);
|
||||
Broadcast(&(*sendrecv_data)[0], size * sizeof(char), root);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -77,6 +77,8 @@ class SlaveEntry:
|
||||
self.sock.sendint(rank)
|
||||
# send parent rank
|
||||
self.sock.sendint((rank + 1) / 2 - 1)
|
||||
# send world size
|
||||
self.sock.sendint(nslave)
|
||||
while True:
|
||||
ngood = self.sock.recvint()
|
||||
goodset = set([])
|
||||
@ -88,8 +90,6 @@ class SlaveEntry:
|
||||
for r in badset:
|
||||
if r in wait_conn:
|
||||
conset.append(r)
|
||||
print 'rank=%d' % rank
|
||||
print 'conset=%s' % str(conset)
|
||||
self.sock.sendint(len(conset))
|
||||
self.sock.sendint(len(badset) - len(conset))
|
||||
for r in conset:
|
||||
@ -109,7 +109,6 @@ class SlaveEntry:
|
||||
for r in rmset:
|
||||
wait_conn.pop(r, None)
|
||||
self.wait_accept = len(badset) - len(conset)
|
||||
print 'wait=%d' % self.wait_accept
|
||||
return rmset
|
||||
|
||||
class Master:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user