more tracker renaming
This commit is contained in:
parent
a569bf2698
commit
19631ecef6
@ -15,8 +15,8 @@ namespace rabit {
|
|||||||
namespace engine {
|
namespace engine {
|
||||||
// constructor
|
// constructor
|
||||||
AllreduceBase::AllreduceBase(void) {
|
AllreduceBase::AllreduceBase(void) {
|
||||||
master_uri = "NULL";
|
tracker_uri = "NULL";
|
||||||
master_port = 9000;
|
tracker_port = 9000;
|
||||||
host_uri = "";
|
host_uri = "";
|
||||||
slave_port = 9010;
|
slave_port = 9010;
|
||||||
nport_trial = 1000;
|
nport_trial = 1000;
|
||||||
@ -45,7 +45,7 @@ void AllreduceBase::Init(void) {
|
|||||||
utils::Socket::Startup();
|
utils::Socket::Startup();
|
||||||
utils::Assert(links.size() == 0, "can only call Init once");
|
utils::Assert(links.size() == 0, "can only call Init once");
|
||||||
this->host_uri = utils::SockAddr::GetHostName();
|
this->host_uri = utils::SockAddr::GetHostName();
|
||||||
// get information from master
|
// get information from tracker
|
||||||
this->ReConnectLinks();
|
this->ReConnectLinks();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -55,22 +55,22 @@ void AllreduceBase::Shutdown(void) {
|
|||||||
}
|
}
|
||||||
links.clear();
|
links.clear();
|
||||||
|
|
||||||
if (master_uri == "NULL") return;
|
if (tracker_uri == "NULL") return;
|
||||||
int magic = kMagic;
|
int magic = kMagic;
|
||||||
// notify master rank i have shutdown
|
// notify tracker rank i have shutdown
|
||||||
utils::TCPSocket master;
|
utils::TCPSocket tracker;
|
||||||
master.Create();
|
tracker.Create();
|
||||||
if (!master.Connect(utils::SockAddr(master_uri.c_str(), master_port))) {
|
if (!tracker.Connect(utils::SockAddr(tracker_uri.c_str(), tracker_port))) {
|
||||||
utils::Socket::Error("Connect Master");
|
utils::Socket::Error("Connect Tracker");
|
||||||
}
|
}
|
||||||
utils::Assert(master.SendAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 1");
|
utils::Assert(tracker.SendAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 1");
|
||||||
utils::Assert(master.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 2");
|
utils::Assert(tracker.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 2");
|
||||||
utils::Check(magic == kMagic, "sync::Invalid master message, init failure");
|
utils::Check(magic == kMagic, "sync::Invalid tracker message, init failure");
|
||||||
|
|
||||||
utils::Assert(master.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
|
utils::Assert(tracker.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
|
||||||
master.SendStr(task_id);
|
tracker.SendStr(task_id);
|
||||||
master.SendStr(std::string("shutdown"));
|
tracker.SendStr(std::string("shutdown"));
|
||||||
master.Close();
|
tracker.Close();
|
||||||
utils::TCPSocket::Finalize();
|
utils::TCPSocket::Finalize();
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
@ -79,8 +79,8 @@ void AllreduceBase::Shutdown(void) {
|
|||||||
* \param val parameter value
|
* \param val parameter value
|
||||||
*/
|
*/
|
||||||
void AllreduceBase::SetParam(const char *name, const char *val) {
|
void AllreduceBase::SetParam(const char *name, const char *val) {
|
||||||
if (!strcmp(name, "master_uri")) master_uri = val;
|
if (!strcmp(name, "rabit_tracker_uri")) tracker_uri = val;
|
||||||
if (!strcmp(name, "master_port")) master_port = atoi(val);
|
if (!strcmp(name, "rabit_tracker_port")) tracker_port = atoi(val);
|
||||||
if (!strcmp(name, "task_id")) task_id = val;
|
if (!strcmp(name, "task_id")) task_id = val;
|
||||||
if (!strcmp(name, "hadoop_mode")) hadoop_mode = atoi(val);
|
if (!strcmp(name, "hadoop_mode")) hadoop_mode = atoi(val);
|
||||||
if (!strcmp(name, "reduce_buffer")) {
|
if (!strcmp(name, "reduce_buffer")) {
|
||||||
@ -100,34 +100,34 @@ void AllreduceBase::SetParam(const char *name, const char *val) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief connect to the master to fix the the missing links
|
* \brief connect to the tracker to fix the the missing links
|
||||||
* this function is also used when the engine start up
|
* this function is also used when the engine start up
|
||||||
*/
|
*/
|
||||||
void AllreduceBase::ReConnectLinks(const char *cmd) {
|
void AllreduceBase::ReConnectLinks(const char *cmd) {
|
||||||
// single node mode
|
// single node mode
|
||||||
if (master_uri == "NULL") {
|
if (tracker_uri == "NULL") {
|
||||||
rank = 0; return;
|
rank = 0; return;
|
||||||
}
|
}
|
||||||
int magic = kMagic;
|
int magic = kMagic;
|
||||||
// get information from master
|
// get information from tracker
|
||||||
utils::TCPSocket master;
|
utils::TCPSocket tracker;
|
||||||
master.Create();
|
tracker.Create();
|
||||||
if (!master.Connect(utils::SockAddr(master_uri.c_str(), master_port))) {
|
if (!tracker.Connect(utils::SockAddr(tracker_uri.c_str(), tracker_port))) {
|
||||||
utils::Socket::Error("Connect");
|
utils::Socket::Error("Connect");
|
||||||
}
|
}
|
||||||
utils::Assert(master.SendAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 1");
|
utils::Assert(tracker.SendAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 1");
|
||||||
utils::Assert(master.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 2");
|
utils::Assert(tracker.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 2");
|
||||||
utils::Check(magic == kMagic, "sync::Invalid master message, init failure");
|
utils::Check(magic == kMagic, "sync::Invalid tracker message, init failure");
|
||||||
utils::Assert(master.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
|
utils::Assert(tracker.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3");
|
||||||
master.SendStr(task_id);
|
tracker.SendStr(task_id);
|
||||||
master.SendStr(std::string(cmd));
|
tracker.SendStr(std::string(cmd));
|
||||||
{// get new ranks
|
{// get new ranks
|
||||||
int newrank;
|
int newrank;
|
||||||
utils::Assert(master.RecvAll(&newrank, sizeof(newrank)) == sizeof(newrank),
|
utils::Assert(tracker.RecvAll(&newrank, sizeof(newrank)) == sizeof(newrank),
|
||||||
"ReConnectLink failure 4");
|
"ReConnectLink failure 4");
|
||||||
utils::Assert(master.RecvAll(&parent_rank, sizeof(parent_rank)) == sizeof(parent_rank),
|
utils::Assert(tracker.RecvAll(&parent_rank, sizeof(parent_rank)) == sizeof(parent_rank),
|
||||||
"ReConnectLink failure 4");
|
"ReConnectLink failure 4");
|
||||||
utils::Assert(master.RecvAll(&world_size, sizeof(world_size)) == sizeof(world_size),
|
utils::Assert(tracker.RecvAll(&world_size, sizeof(world_size)) == sizeof(world_size),
|
||||||
"ReConnectLink failure 4");
|
"ReConnectLink failure 4");
|
||||||
utils::Assert(rank == -1 || newrank == rank, "must keep rank to same if the node already have one");
|
utils::Assert(rank == -1 || newrank == rank, "must keep rank to same if the node already have one");
|
||||||
rank = newrank;
|
rank = newrank;
|
||||||
@ -139,7 +139,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
|
|||||||
utils::Check(port != -1, "ReConnectLink fail to bind the ports specified");
|
utils::Check(port != -1, "ReConnectLink fail to bind the ports specified");
|
||||||
sock_listen.Listen();
|
sock_listen.Listen();
|
||||||
|
|
||||||
// get number of to connect and number of to accept nodes from master
|
// get number of to connect and number of to accept nodes from tracker
|
||||||
int num_conn, num_accept, num_error = 1;
|
int num_conn, num_accept, num_error = 1;
|
||||||
do {
|
do {
|
||||||
// send over good links
|
// send over good links
|
||||||
@ -152,24 +152,24 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
int ngood = static_cast<int>(good_link.size());
|
int ngood = static_cast<int>(good_link.size());
|
||||||
utils::Assert(master.SendAll(&ngood, sizeof(ngood)) == sizeof(ngood),
|
utils::Assert(tracker.SendAll(&ngood, sizeof(ngood)) == sizeof(ngood),
|
||||||
"ReConnectLink failure 5");
|
"ReConnectLink failure 5");
|
||||||
for (size_t i = 0; i < good_link.size(); ++i) {
|
for (size_t i = 0; i < good_link.size(); ++i) {
|
||||||
utils::Assert(master.SendAll(&good_link[i], sizeof(good_link[i])) == sizeof(good_link[i]),
|
utils::Assert(tracker.SendAll(&good_link[i], sizeof(good_link[i])) == sizeof(good_link[i]),
|
||||||
"ReConnectLink failure 6");
|
"ReConnectLink failure 6");
|
||||||
}
|
}
|
||||||
utils::Assert(master.RecvAll(&num_conn, sizeof(num_conn)) == sizeof(num_conn),
|
utils::Assert(tracker.RecvAll(&num_conn, sizeof(num_conn)) == sizeof(num_conn),
|
||||||
"ReConnectLink failure 7");
|
"ReConnectLink failure 7");
|
||||||
utils::Assert(master.RecvAll(&num_accept, sizeof(num_accept)) == sizeof(num_accept),
|
utils::Assert(tracker.RecvAll(&num_accept, sizeof(num_accept)) == sizeof(num_accept),
|
||||||
"ReConnectLink failure 8");
|
"ReConnectLink failure 8");
|
||||||
num_error = 0;
|
num_error = 0;
|
||||||
for (int i = 0; i < num_conn; ++i) {
|
for (int i = 0; i < num_conn; ++i) {
|
||||||
LinkRecord r;
|
LinkRecord r;
|
||||||
int hport, hrank;
|
int hport, hrank;
|
||||||
std::string hname;
|
std::string hname;
|
||||||
master.RecvStr(&hname);
|
tracker.RecvStr(&hname);
|
||||||
utils::Assert(master.RecvAll(&hport, sizeof(hport)) == sizeof(hport), "ReConnectLink failure 9");
|
utils::Assert(tracker.RecvAll(&hport, sizeof(hport)) == sizeof(hport), "ReConnectLink failure 9");
|
||||||
utils::Assert(master.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank), "ReConnectLink failure 10");
|
utils::Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank), "ReConnectLink failure 10");
|
||||||
r.sock.Create();
|
r.sock.Create();
|
||||||
if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) {
|
if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) {
|
||||||
num_error += 1; r.sock.Close(); continue;
|
num_error += 1; r.sock.Close(); continue;
|
||||||
@ -186,12 +186,12 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
|
|||||||
}
|
}
|
||||||
if (!match) links.push_back(r);
|
if (!match) links.push_back(r);
|
||||||
}
|
}
|
||||||
utils::Assert(master.SendAll(&num_error, sizeof(num_error)) == sizeof(num_error), "ReConnectLink failure 14");
|
utils::Assert(tracker.SendAll(&num_error, sizeof(num_error)) == sizeof(num_error), "ReConnectLink failure 14");
|
||||||
} while (num_error != 0);
|
} while (num_error != 0);
|
||||||
// send back socket listening port to master
|
// send back socket listening port to tracker
|
||||||
utils::Assert(master.SendAll(&port, sizeof(port)) == sizeof(port), "ReConnectLink failure 14");
|
utils::Assert(tracker.SendAll(&port, sizeof(port)) == sizeof(port), "ReConnectLink failure 14");
|
||||||
// close connection to master
|
// close connection to tracker
|
||||||
master.Close();
|
tracker.Close();
|
||||||
// listen to incoming links
|
// listen to incoming links
|
||||||
for (int i = 0; i < num_accept; ++i) {
|
for (int i = 0; i < num_accept; ++i) {
|
||||||
LinkRecord r;
|
LinkRecord r;
|
||||||
|
|||||||
@ -260,9 +260,9 @@ class AllreduceBase : public IEngine {
|
|||||||
std::vector<uint64_t> buffer_;
|
std::vector<uint64_t> buffer_;
|
||||||
};
|
};
|
||||||
/*!
|
/*!
|
||||||
* \brief connect to the master to fix the the missing links
|
* \brief connect to the tracker to fix the the missing links
|
||||||
* this function is also used when the engine start up
|
* this function is also used when the engine start up
|
||||||
* \param cmd possible command to sent to master
|
* \param cmd possible command to sent to tracker
|
||||||
*/
|
*/
|
||||||
void ReConnectLinks(const char *cmd = "start");
|
void ReConnectLinks(const char *cmd = "start");
|
||||||
/*!
|
/*!
|
||||||
@ -316,10 +316,10 @@ class AllreduceBase : public IEngine {
|
|||||||
std::string task_id;
|
std::string task_id;
|
||||||
// uri of current host, to be set by Init
|
// uri of current host, to be set by Init
|
||||||
std::string host_uri;
|
std::string host_uri;
|
||||||
// uri of master
|
// uri of tracker
|
||||||
std::string master_uri;
|
std::string tracker_uri;
|
||||||
// port of master address
|
// port of tracker address
|
||||||
int master_port;
|
int tracker_port;
|
||||||
// port of slave process
|
// port of slave process
|
||||||
int slave_port, nport_trial;
|
int slave_port, nport_trial;
|
||||||
// reduce buffer size
|
// reduce buffer size
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
"""
|
"""
|
||||||
Master script for rabit
|
Tracker script for rabit
|
||||||
Implements the master control protocol to start rabit jobs and assign necessary information
|
Implements the tracker control protocol
|
||||||
|
- start rabit jobs
|
||||||
|
- help nodes to establish links with each other
|
||||||
|
|
||||||
Tianqi Chen
|
Tianqi Chen
|
||||||
"""
|
"""
|
||||||
@ -128,8 +130,8 @@ class Tracker:
|
|||||||
def __del__(self):
|
def __del__(self):
|
||||||
self.sock.close()
|
self.sock.close()
|
||||||
def slave_args(self):
|
def slave_args(self):
|
||||||
return ['master_uri=%s' % socket.gethostname(),
|
return ['rabit_tracker_uri=%s' % socket.gethostname(),
|
||||||
'master_port=%s' % self.port]
|
'rabit_tracker_port=%s' % self.port]
|
||||||
def accept_slaves(self, nslave):
|
def accept_slaves(self, nslave):
|
||||||
# set of nodes that finishs the job
|
# set of nodes that finishs the job
|
||||||
shutdown = {}
|
shutdown = {}
|
||||||
|
|||||||
@ -1,8 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
if [ "$#" -lt 4 ];
|
|
||||||
then
|
|
||||||
echo "Usage <nslave> <ndata> <config> <round_files_dir>"
|
|
||||||
exit -1
|
|
||||||
fi
|
|
||||||
|
|
||||||
../submit_job.py $1 test_recover "${@:2}"
|
|
||||||
Loading…
x
Reference in New Issue
Block a user