execute it like this: ./test.sh 4 4000 testcase0.conf ./
Now we pass the folder where the round instances are saved. The problem is that calling utils::Check or utils::Assert on 1 or 2 nodes shuts down all of them. Only those nodes should be shut down; then this will work. There may be some other mechanism to shut down a particular node. Tianqi?
This commit is contained in:
parent
faed8285cd
commit
a8128493c2
52
src/mock.h
52
src/mock.h
@ -7,7 +7,9 @@
|
|||||||
*/
|
*/
|
||||||
#include "./allreduce.h"
|
#include "./allreduce.h"
|
||||||
#include "./config.h"
|
#include "./config.h"
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <sstream>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
|
|
||||||
/*! \brief namespace of mock */
|
/*! \brief namespace of mock */
|
||||||
@ -18,8 +20,8 @@ class Mock {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
Mock(const int& rank, char *config) : rank(rank) {
|
explicit Mock(const int& rank, char *config, char* round_dir) : rank(rank) {
|
||||||
Init(config);
|
Init(config, round_dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename OP>
|
template<typename OP>
|
||||||
@ -46,20 +48,42 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
inline void Init(char* config) {
|
inline void Init(char* config, char* round_dir) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << round_dir << "node" << rank << ".round";
|
||||||
|
const char* round_file = ss.str().c_str();
|
||||||
|
std::ifstream ifs(round_file);
|
||||||
|
int current_round = 1;
|
||||||
|
if (!ifs.good()) {
|
||||||
|
// file does not exists, it's the first time, so save the current round to 1
|
||||||
|
std::ofstream ofs(round_file);
|
||||||
|
ofs << current_round;
|
||||||
|
ofs.close();
|
||||||
|
} else {
|
||||||
|
// file does exists, read the previous round, increment by one, and save it back
|
||||||
|
ifs >> current_round;
|
||||||
|
current_round++;
|
||||||
|
ifs.close();
|
||||||
|
std::ofstream ofs(round_file);
|
||||||
|
ofs << current_round;
|
||||||
|
ofs.close();
|
||||||
|
}
|
||||||
|
printf("[%d] in round %d\n", rank, current_round);
|
||||||
utils::ConfigIterator itr(config);
|
utils::ConfigIterator itr(config);
|
||||||
while (itr.Next()) {
|
while (itr.Next()) {
|
||||||
char round[4], node_rank[4];
|
char round[4], node_rank[4];
|
||||||
sscanf(itr.name(), "%[^_]_%s", round, node_rank);
|
sscanf(itr.name(), "%[^_]_%s", round, node_rank);
|
||||||
int i_round = atoi(round);
|
int i_node_rank = atoi(node_rank);
|
||||||
if (i_round == 1) {
|
// if it's something for me
|
||||||
int i_node_rank = atoi(node_rank);
|
if (i_node_rank == rank) {
|
||||||
if (i_node_rank == rank) {
|
int i_round = atoi(round);
|
||||||
printf("[%d] round %d, value %s\n", rank, i_round, itr.val());
|
// in my current round
|
||||||
if (strcmp("allreduce", itr.val())) record(allReduce);
|
if (i_round == current_round) {
|
||||||
else if (strcmp("broadcast", itr.val())) record(broadcast);
|
printf("[%d] round %d, value %s\n", rank, i_round, itr.val());
|
||||||
else if (strcmp("loadcheckpoint", itr.val())) record(loadCheckpoint);
|
if (strcmp("allreduce", itr.val())) record(allReduce);
|
||||||
else if (strcmp("checkpoint", itr.val())) record(checkpoint);
|
else if (strcmp("broadcast", itr.val())) record(broadcast);
|
||||||
|
else if (strcmp("loadcheckpoint", itr.val())) record(loadCheckpoint);
|
||||||
|
else if (strcmp("checkpoint", itr.val())) record(checkpoint);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -82,6 +106,8 @@ private:
|
|||||||
std::map<int,bool> broadcast;
|
std::map<int,bool> broadcast;
|
||||||
std::map<int,bool> loadCheckpoint;
|
std::map<int,bool> loadCheckpoint;
|
||||||
std::map<int,bool> checkpoint;
|
std::map<int,bool> checkpoint;
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,7 +1,8 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
if [ "$#" -ne 3 ];
|
if [ "$#" -ne 4 ];
|
||||||
then
|
then
|
||||||
echo "Usage <nslave> <ndata> <config>"
|
echo "Usage <nslave> <ndata> <config> <round_files_dir>"
|
||||||
exit -1
|
exit -1
|
||||||
fi
|
fi
|
||||||
../submit_job_tcp.py $1 test_allreduce $2 $3
|
|
||||||
|
../submit_job_tcp.py $1 test_allreduce $2 $3 $4
|
||||||
@ -72,7 +72,7 @@ int main(int argc, char *argv[]) {
|
|||||||
int rank = sync::GetRank();
|
int rank = sync::GetRank();
|
||||||
std::string name = sync::GetProcessorName();
|
std::string name = sync::GetProcessorName();
|
||||||
|
|
||||||
test::Mock mock(rank, argv[2]);
|
test::Mock mock(rank, argv[2], argv[3]);
|
||||||
|
|
||||||
printf("[%d] start at %s\n", rank, name.c_str());
|
printf("[%d] start at %s\n", rank, name.c_str());
|
||||||
TestMax(mock, n);
|
TestMax(mock, n);
|
||||||
|
|||||||
@ -5,8 +5,5 @@
|
|||||||
|
|
||||||
1_0 = allreduce
|
1_0 = allreduce
|
||||||
1_1 = broadcast
|
1_1 = broadcast
|
||||||
1_2 = loadcheckpoint
|
|
||||||
1_3 = checkpoint
|
|
||||||
|
|
||||||
2_0 = allreduce
|
2_2 = allreduce
|
||||||
2_2 = checkpoint
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user