Execute it like this: ./test.sh 4 4000 testcase0.conf ./
Now we are passing the folder where the round instances are saved. The problem is that calling utils::Check or utils::Assert on 1 or 2 nodes shuts down all of them. Only those nodes should be shut down for this to work. There may be some other mechanism to shut down a particular node. Tianqi?
This commit is contained in:
parent
faed8285cd
commit
a8128493c2
52
src/mock.h
52
src/mock.h
@ -7,7 +7,9 @@
|
||||
*/
|
||||
#include "./allreduce.h"
|
||||
#include "./config.h"
|
||||
#include <map>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
|
||||
|
||||
/*! \brief namespace of mock */
|
||||
@ -18,8 +20,8 @@ class Mock {
|
||||
|
||||
public:
|
||||
|
||||
Mock(const int& rank, char *config) : rank(rank) {
|
||||
Init(config);
|
||||
explicit Mock(const int& rank, char *config, char* round_dir) : rank(rank) {
|
||||
Init(config, round_dir);
|
||||
}
|
||||
|
||||
template<typename OP>
|
||||
@ -46,20 +48,42 @@ public:
|
||||
|
||||
private:
|
||||
|
||||
inline void Init(char* config) {
|
||||
inline void Init(char* config, char* round_dir) {
|
||||
std::stringstream ss;
|
||||
ss << round_dir << "node" << rank << ".round";
|
||||
const char* round_file = ss.str().c_str();
|
||||
std::ifstream ifs(round_file);
|
||||
int current_round = 1;
|
||||
if (!ifs.good()) {
|
||||
// file does not exist: this is the first run, so save the current round as 1
|
||||
std::ofstream ofs(round_file);
|
||||
ofs << current_round;
|
||||
ofs.close();
|
||||
} else {
|
||||
// file exists: read the previous round, increment it by one, and save it back
|
||||
ifs >> current_round;
|
||||
current_round++;
|
||||
ifs.close();
|
||||
std::ofstream ofs(round_file);
|
||||
ofs << current_round;
|
||||
ofs.close();
|
||||
}
|
||||
printf("[%d] in round %d\n", rank, current_round);
|
||||
utils::ConfigIterator itr(config);
|
||||
while (itr.Next()) {
|
||||
char round[4], node_rank[4];
|
||||
sscanf(itr.name(), "%[^_]_%s", round, node_rank);
|
||||
int i_round = atoi(round);
|
||||
if (i_round == 1) {
|
||||
int i_node_rank = atoi(node_rank);
|
||||
if (i_node_rank == rank) {
|
||||
printf("[%d] round %d, value %s\n", rank, i_round, itr.val());
|
||||
if (strcmp("allreduce", itr.val())) record(allReduce);
|
||||
else if (strcmp("broadcast", itr.val())) record(broadcast);
|
||||
else if (strcmp("loadcheckpoint", itr.val())) record(loadCheckpoint);
|
||||
else if (strcmp("checkpoint", itr.val())) record(checkpoint);
|
||||
int i_node_rank = atoi(node_rank);
|
||||
// if it's something for me
|
||||
if (i_node_rank == rank) {
|
||||
int i_round = atoi(round);
|
||||
// in my current round
|
||||
if (i_round == current_round) {
|
||||
printf("[%d] round %d, value %s\n", rank, i_round, itr.val());
|
||||
if (strcmp("allreduce", itr.val())) record(allReduce);
|
||||
else if (strcmp("broadcast", itr.val())) record(broadcast);
|
||||
else if (strcmp("loadcheckpoint", itr.val())) record(loadCheckpoint);
|
||||
else if (strcmp("checkpoint", itr.val())) record(checkpoint);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -82,6 +106,8 @@ private:
|
||||
std::map<int,bool> broadcast;
|
||||
std::map<int,bool> loadCheckpoint;
|
||||
std::map<int,bool> checkpoint;
|
||||
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
#!/bin/bash
|
||||
if [ "$#" -ne 3 ];
|
||||
if [ "$#" -ne 4 ];
|
||||
then
|
||||
echo "Usage <nslave> <ndata> <config>"
|
||||
echo "Usage <nslave> <ndata> <config> <round_files_dir>"
|
||||
exit -1
|
||||
fi
|
||||
../submit_job_tcp.py $1 test_allreduce $2 $3
|
||||
|
||||
../submit_job_tcp.py $1 test_allreduce $2 $3 $4
|
||||
@ -72,7 +72,7 @@ int main(int argc, char *argv[]) {
|
||||
int rank = sync::GetRank();
|
||||
std::string name = sync::GetProcessorName();
|
||||
|
||||
test::Mock mock(rank, argv[2]);
|
||||
test::Mock mock(rank, argv[2], argv[3]);
|
||||
|
||||
printf("[%d] start at %s\n", rank, name.c_str());
|
||||
TestMax(mock, n);
|
||||
|
||||
@ -5,8 +5,5 @@
|
||||
|
||||
1_0 = allreduce
|
||||
1_1 = broadcast
|
||||
1_2 = loadcheckpoint
|
||||
1_3 = checkpoint
|
||||
|
||||
2_0 = allreduce
|
||||
2_2 = checkpoint
|
||||
2_2 = allreduce
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user