add speed test

This commit is contained in:
tqchen
2014-12-06 11:05:24 -08:00
parent 19631ecef6
commit 0e012cb05e
6 changed files with 234 additions and 36 deletions

View File

@@ -24,12 +24,12 @@ AllreduceRobust::AllreduceRobust(void) {
void AllreduceRobust::Shutdown(void) {
// need to sync the exec before we shutdown, do a pesudo check point
// execute checkpoint, note: when checkpoint existing, load will not happen
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, ActionSummary::kMaxSeq),
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, ActionSummary::kSpecialOp),
"check point must return true");
// reset result buffer
resbuf.Clear(); seq_counter = 0;
// execute check ack step, load happens here
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kMaxSeq),
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp),
"check ack must return true");
AllreduceBase::Shutdown();
}
@@ -133,7 +133,7 @@ int AllreduceRobust::LoadCheckPoint(utils::ISerializable *global_model,
utils::ISerializable *local_model) {
utils::Check(local_model == NULL, "CheckPoint local_model is not yet supported");
// check if we succesfll
if (RecoverExec(NULL, 0, ActionSummary::kLoadCheck, ActionSummary::kMaxSeq)) {
if (RecoverExec(NULL, 0, ActionSummary::kLoadCheck, ActionSummary::kSpecialOp)) {
// reset result buffer
resbuf.Clear(); seq_counter = 0;
// load from buffer
@@ -142,7 +142,7 @@ int AllreduceRobust::LoadCheckPoint(utils::ISerializable *global_model,
if (version_number == 0) return version_number;
global_model->Load(fs);
// run another phase of check ack, if recovered from data
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kMaxSeq),
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp),
"check ack must return true");
return version_number;
} else {
@@ -172,7 +172,7 @@ void AllreduceRobust::CheckPoint(const utils::ISerializable *global_model,
const utils::ISerializable *local_model) {
utils::Assert(local_model == NULL, "CheckPoint local model is not supported yet");
// execute checkpoint, note: when checkpoint existing, load will not happen
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, ActionSummary::kMaxSeq),
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, ActionSummary::kSpecialOp),
"check point must return true");
// this is the critical region where we will change all the stored models
// increase version number
@@ -185,7 +185,7 @@ void AllreduceRobust::CheckPoint(const utils::ISerializable *global_model,
// reset result buffer
resbuf.Clear(); seq_counter = 0;
// execute check ack step, load happens here
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kMaxSeq),
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp),
"check ack must return true");
}
/*!
@@ -608,6 +608,10 @@ AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) {
*/
AllreduceRobust::ReturnType
AllreduceRobust::TryGetResult(void *sendrecvbuf, size_t size, int seqno, bool requester) { RecoverType role;
// if minimum sequence requested is local check point ack,
// this means all nodes have finished local check point, directly return
if (seqno == ActionSummary::kLocalCheckAck) return kSuccess;
if (!requester) {
sendrecvbuf = resbuf.Query(seqno, &size);
role = sendrecvbuf != NULL ? kHaveData : kPassData;
@@ -631,7 +635,7 @@ AllreduceRobust::TryGetResult(void *sendrecvbuf, size_t size, int seqno, bool re
* \param size the total size of the buffer
* \param flag flag information about the action \sa ActionSummary
* \param seqno sequence number of the action, if it is special action with flag set,
* seqno needs to be set to ActionSummary::kMaxSeq
* seqno needs to be set to ActionSummary::kSpecialOp
*
* \return if this function can return true or false
* - true means buf already set to the
@@ -640,7 +644,7 @@ AllreduceRobust::TryGetResult(void *sendrecvbuf, size_t size, int seqno, bool re
*/
bool AllreduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) {
if (flag != 0) {
utils::Assert(seqno == ActionSummary::kMaxSeq, "must only set seqno for normal operations");
utils::Assert(seqno == ActionSummary::kSpecialOp, "must only set seqno for normal operations");
}
// request
ActionSummary req(flag, seqno);
@@ -672,7 +676,7 @@ bool AllreduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) {
} else {
if (act.check_point()) {
if (act.diff_seq()) {
utils::Assert(act.min_seqno() != ActionSummary::kMaxSeq, "min seq bug");
utils::Assert(act.min_seqno() != ActionSummary::kSpecialOp, "min seq bug");
bool requester = req.min_seqno() == act.min_seqno();
if (!CheckAndRecover(TryGetResult(buf, size, act.min_seqno(), requester))) continue;
if (requester) return true;
@@ -691,7 +695,7 @@ bool AllreduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) {
if (req.load_check()) return true;
} else {
// no special flags, no checkpoint, check ack, load_check
utils::Assert(act.min_seqno() != ActionSummary::kMaxSeq, "min seq bug");
utils::Assert(act.min_seqno() != ActionSummary::kSpecialOp, "min seq bug");
if (act.diff_seq()) {
bool requester = req.min_seqno() == act.min_seqno();
if (!CheckAndRecover(TryGetResult(buf, size, act.min_seqno(), requester))) continue;
@@ -708,7 +712,54 @@ bool AllreduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) {
utils::Assert(false, "RecoverExec: should not reach here");
return true;
}
/*!
* \brief try to recover the local state, making each local state to be the result of itself
* plus replication of states in previous num_local_replica hops in the ring
*
* The input parameters must contain the valid local states available in current nodes,
* This function try ist best to "complete" the missing parts of local_rptr and local_chkpt
* If there is sufficient information in the ring, when the function returns, local_chkpt will
* contain num_local_replica + 1 checkpoints (including the chkpt of this node)
* If there is no sufficient information in the ring, this function the number of checkpoints
* will be less than the specified value
*
* \param p_local_rptr the pointer to the segment pointers in the states array
* \param p_local_chkpt the pointer to the storage of local check points
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
* \sa ReturnType
*/
AllreduceRobust::ReturnType
AllreduceRobust::TryRecoverLocalState(std::vector<size_t> *p_local_rptr,
std::string *p_local_chkpt) {
// if there is no local replica, we can do nothing
if (num_local_replica == 0) return kSuccess;
std::vector<size_t> &rptr = *p_local_rptr;
std::string &chkpt = *p_local_chkpt;
if (rptr.size() == 0) {
rptr.push_back(0);
utils::Assert(chkpt.length() == 0, "local chkpt space inconsistent");
}
const int n = num_local_replica;
// message send to previous link
{
int msg_forward[2];
int nlocal = static_cast<int>(rptr.size() - 1);
msg_forward[0] = nlocal;
utils::Assert(msg_forward[0] <= n, "invalid local replica");
// backward passing one hop the request
ReturnType succ = RingPassing(msg_forward,
1 * sizeof(int), 2 * sizeof(int),
0 * sizeof(int), 1 * sizeof(int),
ring_prev, ring_next);
if (succ != kSuccess) return succ;
// check how much current node can help with the request
// if (nlocal > ) {
//}
}
return kSuccess;
}
/*!
* \brief perform a ring passing to receive data from prev link, and sent data to next link
* this allows data to stream over a ring structure
@@ -723,8 +774,8 @@ bool AllreduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) {
* \param read_end the ending position to read
* \param write_ptr the initial write pointer
* \param write_end the ending position to write
* \param prev pointer to link to previous position in ring
* \param prev pointer to link of next position in ring
* \param read_link pointer to link to previous position in ring
* \param write_link pointer to link of next position in ring
*/
AllreduceRobust::ReturnType
AllreduceRobust::RingPassing(void *sendrecvbuf_,
@@ -732,14 +783,14 @@ AllreduceRobust::RingPassing(void *sendrecvbuf_,
size_t read_end,
size_t write_ptr,
size_t write_end,
LinkRecord *prev_link,
LinkRecord *next_link) {
LinkRecord *read_link,
LinkRecord *write_link) {
if (links.size() == 0 || read_end == 0) return kSuccess;
utils::Assert(read_end <= write_end, "boundary check");
utils::Assert(read_ptr <= read_end, "boundary check");
utils::Assert(write_ptr <= write_end, "boundary check");
// take reference
LinkRecord &prev = *prev_link, &next = *next_link;
LinkRecord &prev = *read_link, &next = *write_link;
// send recv buffer
char *buf = reinterpret_cast<char*>(sendrecvbuf_);
while (true) {

View File

@@ -122,7 +122,11 @@ class AllreduceRobust : public AllreduceBase {
*/
struct ActionSummary {
// maximumly allowed sequence id
const static int kMaxSeq = 1 << 26;
const static int kSpecialOp = (1 << 26);
// special sequence number for local state checkpoint
const static int kLocalCheckPoint = (1 << 26) - 2;
// special sequnce number for local state checkpoint ack signal
const static int kLocalCheckAck = (1 << 26) - 1;
//---------------------------------------------
// The following are bit mask of flag used in
//----------------------------------------------
@@ -140,7 +144,7 @@ class AllreduceRobust : public AllreduceBase {
// constructor
ActionSummary(void) {}
// constructor of action
ActionSummary(int flag, int minseqno = kMaxSeq) {
ActionSummary(int flag, int minseqno = kSpecialOp) {
seqcode = (minseqno << 4) | flag;
}
// minimum number of all operations
@@ -277,14 +281,14 @@ class AllreduceRobust : public AllreduceBase {
* \param buf the buffer to store the result
* \param size the total size of the buffer
* \param flag flag information about the action \sa ActionSummary
* \param seqno sequence number of the action, if it is special action with flag set, seqno needs to be set to ActionSummary::kMaxSeq
* \param seqno sequence number of the action, if it is special action with flag set, seqno needs to be set to ActionSummary::kSpecialOp
*
* \return if this function can return true or false
* - true means buf already set to the
* result by recovering procedure, the action is complete, no further action is needed
* - false means this is the lastest action that has not yet been executed, need to execute the action
*/
bool RecoverExec(void *buf, size_t size, int flag, int seqno = ActionSummary::kMaxSeq);
bool RecoverExec(void *buf, size_t size, int flag, int seqno = ActionSummary::kSpecialOp);
/*!
* \brief try to load check point
*
@@ -344,12 +348,30 @@ class AllreduceRobust : public AllreduceBase {
*
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
* \sa ReturnType, TryDecideRouting
*/
*/
ReturnType TryRecoverData(RecoverType role,
void *sendrecvbuf_,
size_t size,
int recv_link,
const std::vector<bool> &req_in);
/*!
* \brief try to recover the local state, making each local state to be the result of itself
* plus replication of states in previous num_local_replica hops in the ring
*
* The input parameters must contain the valid local states available in current nodes,
* This function try ist best to "complete" the missing parts of local_rptr and local_chkpt
* If there is sufficient information in the ring, when the function returns, local_chkpt will
* contain num_local_replica + 1 checkpoints (including the chkpt of this node)
* If there is no sufficient information in the ring, this function the number of checkpoints
* will be less than the specified value
*
* \param p_local_rptr the pointer to the segment pointers in the states array
* \param p_local_chkpt the pointer to the storage of local check points
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
* \sa ReturnType
*/
ReturnType TryRecoverLocalState(std::vector<size_t> *p_local_rptr,
std::string *p_local_chkpt);
/*!
* \brief perform a ring passing to receive data from prev link, and sent data to next link
* this allows data to stream over a ring structure
@@ -364,16 +386,16 @@ class AllreduceRobust : public AllreduceBase {
* \param read_end the ending position to read
* \param write_ptr the initial write pointer
* \param write_end the ending position to write
* \param prev pointer to link to previous position in ring
* \param prev pointer to link of next position in ring
* \param read_link pointer to link to previous position in ring
* \param write_link pointer to link of next position in ring
*/
ReturnType RingPassing(void *senrecvbuf_,
size_t read_ptr,
size_t read_end,
size_t write_ptr,
size_t write_end,
LinkRecord *prev_link,
LinkRecord *next_link);
LinkRecord *read_link,
LinkRecord *write_link);
/*!
* \brief run message passing algorithm on the allreduce tree
* the result is edge message stored in p_edge_in and p_edge_out
@@ -410,15 +432,18 @@ class AllreduceRobust : public AllreduceBase {
std::string global_checkpoint;
// number of replica for local state/model
int num_local_replica;
// --- recovery data structure for local checkpoint
// there is two version of the data structure,
// at one time one version is valid and another is used as temp memory
// pointer to memory position in the local model
// local model is stored in CSR format(like a sparse matrices)
// local_model[rptr[0]:rptr[1]] stores the model of current node
// local_model[rptr[k]:rptr[k+1]] stores the model of node in previous k hops in the ring
std::vector<size_t> local_rptr;
std::vector<size_t> local_rptr[2];
// storage for local model replicas
std::string local_checkpoint;
// temporal storage for doing local checkpointing
std::string tmp_local_check;
std::string local_checkpoint[2];
// version of local checkpoint can be 1 or 0
int local_chkpt_version;
};
} // namespace engine
} // namespace rabit

View File

@@ -166,7 +166,7 @@ class Tracker:
s.assign_rank(rank, wait_conn, nslave)
if s.wait_accept > 0:
wait_conn[rank] = s
print 'all slaves setup complete'
print 'All nodes finishes job'
def mpi_submit(nslave, args):
cmd = ' '.join(['mpirun -n %d' % nslave] + args)

23
src/timer.h Normal file
View File

@@ -0,0 +1,23 @@
/*!
* \file timer.h
* \brief This file defines the utils for timing
* \author Tianqi Chen, Nacho, Tianyi
*/
#ifndef RABIT_TIMER_H
#define RABIT_TIMER_H
#include <time.h>
#include "./utils.h"
namespace rabit {
namespace utils {
/*!
* \brief return time in seconds
*/
inline double GetTime(void) {
timespec ts;
utils::Check(clock_gettime(CLOCK_REALTIME, &ts) == 0, "failed to get time");
return static_cast<double>(ts.tv_sec) + static_cast<double>(ts.tv_nsec) * 1e-9;
}
}
}
#endif