Bug-fixed version

This commit is contained in:
tqchen 2014-12-03 11:51:39 -08:00
parent a186f8c3aa
commit 8a6768763d
5 changed files with 12 additions and 14 deletions

View File

@ -56,6 +56,7 @@ void AllreduceBase::Shutdown(void) {
utils::Assert(master.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
master.SendStr(job_id);
master.SendStr(std::string("shutdown"));
master.Close();
utils::TCPSocket::Finalize();
}
/*!
@ -102,7 +103,6 @@ void AllreduceBase::ReConnectLinks(void) {
utils::Assert(master.SendAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 1");
utils::Assert(master.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 2");
utils::Check(magic == kMagic, "sync::Invalid master message, init failure");
utils::Assert(master.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
master.SendStr(job_id);
master.SendStr(std::string("start"));
@ -112,10 +112,11 @@ void AllreduceBase::ReConnectLinks(void) {
"ReConnectLink failure 4");
utils::Assert(master.RecvAll(&parent_rank, sizeof(parent_rank)) == sizeof(parent_rank),
"ReConnectLink failure 4");
utils::Assert(master.RecvAll(&world_size, sizeof(world_size)) == sizeof(world_size),
"ReConnectLink failure 4");
utils::Assert(rank == -1 || newrank == rank, "must keep rank to same if the node already have one");
rank = newrank;
}
// create listening socket
utils::TCPSocket sock_listen;
sock_listen.Create();
@ -125,7 +126,6 @@ void AllreduceBase::ReConnectLinks(void) {
// get from master the number of nodes to connect to and the number of nodes to accept
int num_conn, num_accept, num_error = 1;
do {
// send over good links
std::vector<int> good_link;
@ -202,7 +202,6 @@ void AllreduceBase::ReConnectLinks(void) {
links[i].sock.SetNonBlock(true);
if (links[i].rank == parent_rank) parent_index = static_cast<int>(i);
}
utils::LogPrintf("[%d] parent_rank=%d, parent_index=%d, nlink=%d\n", rank, parent_rank, parent_index, (int)links.size());
if (parent_rank != -1) {
utils::Assert(parent_index != -1, "cannot find parent in the link");
}

View File

@ -80,7 +80,7 @@ class AllreduceBase : public IEngine {
*/
virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root) {
utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess,
"Allreduce failed");
"Broadcast failed");
}
/*!
* \brief load latest check point

View File

@ -103,7 +103,7 @@ inline void Broadcast(std::vector<DType> *sendrecv_data, int root) {
sendrecv_data->resize(size);
}
if (size != 0) {
Broadcast(&sendrecv_data[0], size * sizeof(DType), root);
Broadcast(&(*sendrecv_data)[0], size * sizeof(DType), root);
}
}
inline void Broadcast(std::string *sendrecv_data, int root) {
@ -113,7 +113,7 @@ inline void Broadcast(std::string *sendrecv_data, int root) {
sendrecv_data->resize(size);
}
if (size != 0) {
Broadcast(&sendrecv_data[0], size * sizeof(char), root);
Broadcast(&(*sendrecv_data)[0], size * sizeof(char), root);
}
}

View File

@ -77,6 +77,8 @@ class SlaveEntry:
self.sock.sendint(rank)
# send parent rank
self.sock.sendint((rank + 1) / 2 - 1)
# send world size
self.sock.sendint(nslave)
while True:
ngood = self.sock.recvint()
goodset = set([])
@ -88,8 +90,6 @@ class SlaveEntry:
for r in badset:
if r in wait_conn:
conset.append(r)
print 'rank=%d' % rank
print 'conset=%s' % str(conset)
self.sock.sendint(len(conset))
self.sock.sendint(len(badset) - len(conset))
for r in conset:
@ -109,7 +109,6 @@ class SlaveEntry:
for r in rmset:
wait_conn.pop(r, None)
self.wait_accept = len(badset) - len(conset)
print 'wait=%d' % self.wait_accept
return rmset
class Master: