lint and travis

This commit is contained in:
tqchen
2015-07-03 15:15:11 -07:00
parent ceedf4ea96
commit 3cc49ad0e8
27 changed files with 423 additions and 296 deletions

View File

@@ -94,7 +94,8 @@ void AllreduceBase::Init(void) {
}
}
if (dmlc_role != "worker") {
fprintf(stderr, "Rabit Module currently only work with dmlc worker, quit this program by exit 0\n");
fprintf(stderr, "Rabit Module currently only work with dmlc worker"\
", quit this program by exit 0\n");
exit(0);
}
// clear the setting before start reconnection
@@ -134,7 +135,7 @@ void AllreduceBase::TrackerPrint(const std::string &msg) {
// util to parse data with unit suffix
inline size_t ParseUnit(const char *name, const char *val) {
char unit;
unsigned long amt;
unsigned long amt; // NOLINT(*)
int n = sscanf(val, "%lu%c", &amt, &unit);
size_t amount = amt;
if (n == 2) {
@@ -154,7 +155,7 @@ inline size_t ParseUnit(const char *name, const char *val) {
}
}
/*!
* \brief set parameters to the engine
* \brief set parameters to the engine
* \param name parameter name
* \param val parameter value
*/
@@ -258,7 +259,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
} else {
if (!all_links[i].sock.IsClosed()) all_links[i].sock.Close();
}
}
}
int ngood = static_cast<int>(good_link.size());
Assert(tracker.SendAll(&ngood, sizeof(ngood)) == sizeof(ngood),
"ReConnectLink failure 5");
@@ -359,7 +360,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
* A kSuccess return from TryAllreduce does NOT mean every node has successfully finished TryAllreduce.
* It only means the current node got the correct result of Allreduce.
* However, it does mean every node has finished the LAST call (instead of this one) of Allreduce/Bcast
*
*
* \param sendrecvbuf_ buffer for both sending and recving data
* \param type_nbytes the unit number of bytes the type have
* \param count number of elements to be reduced
@@ -440,7 +441,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_,
selecter.WatchRead(links[i].sock);
}
// size_write <= size_read
if (links[i].size_write != total_size){
if (links[i].size_write != total_size) {
if (links[i].size_write < size_down_in) {
selecter.WatchWrite(links[i].sock);
}
@@ -477,7 +478,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_,
size_t max_reduce = total_size;
for (int i = 0; i < nlink; ++i) {
if (i != parent_index) {
max_reduce= std::min(max_reduce, links[i].size_read);
max_reduce = std::min(max_reduce, links[i].size_read);
utils::Assert(buffer_size == 0 || buffer_size == links[i].buffer_size,
"buffer size inconsistent");
buffer_size = links[i].buffer_size;
@@ -525,7 +526,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_,
ssize_t len = links[parent_index].sock.
Recv(sendrecvbuf + size_down_in, total_size - size_down_in);
if (len == 0) {
links[parent_index].sock.Close();
links[parent_index].sock.Close();
return ReportError(&links[parent_index], kRecvZeroLen);
}
if (len != -1) {
@@ -670,7 +671,7 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size,
size_t slice_begin,
size_t slice_end,
size_t size_prev_slice) {
// read from next link and send to prev one
// read from next link and send to prev one
LinkRecord &prev = *ring_prev, &next = *ring_next;
// need to rely on special rank structure
utils::Assert(next.rank == (rank + 1) % world_size &&
@@ -678,11 +679,11 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size,
"need to assume rank structure");
// send recv buffer
char *sendrecvbuf = reinterpret_cast<char*>(sendrecvbuf_);
const size_t stop_read = total_size + slice_begin;
const size_t stop_write = total_size + slice_begin - size_prev_slice;
const size_t stop_read = total_size + slice_begin;
const size_t stop_write = total_size + slice_begin - size_prev_slice;
size_t write_ptr = slice_begin;
size_t read_ptr = slice_end;
while (true) {
// select helper
bool finished = true;
@@ -733,7 +734,7 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size,
/*!
* \brief perform in-place allreduce, on sendrecvbuf, this function can fail,
* and will return the cause of failure
*
*
* Ring-based algorithm
*
* \param sendrecvbuf_ buffer for both sending and recving data
@@ -748,7 +749,7 @@ AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_,
size_t type_nbytes,
size_t count,
ReduceFunction reducer) {
// read from next link and send to prev one
// read from next link and send to prev one
LinkRecord &prev = *ring_prev, &next = *ring_next;
// need to rely on special rank structure
utils::Assert(next.rank == (rank + 1) % world_size &&
@@ -757,7 +758,7 @@ AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_,
// total size of message
const size_t total_size = type_nbytes * count;
size_t n = static_cast<size_t>(world_size);
size_t step = (count + n - 1) / n;
size_t step = (count + n - 1) / n;
size_t r = static_cast<size_t>(next.rank);
size_t write_ptr = std::min(r * step, count) * type_nbytes;
size_t read_ptr = std::min((r + 1) * step, count) * type_nbytes;
@@ -830,7 +831,7 @@ AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_,
if (ret != kSuccess) return ReportError(&prev, ret);
}
}
}
}
return kSuccess;
}
/*!
@@ -857,7 +858,7 @@ AllreduceBase::TryAllreduceRing(void *sendrecvbuf_,
size_t end = std::min((rank + 1) * step, count) * type_nbytes;
// previous rank
int prank = ring_prev->rank;
// get rank of previous
// get rank of previous
return TryAllgatherRing
(sendrecvbuf_, type_nbytes * count,
begin, end,

View File

@@ -42,7 +42,7 @@ class AllreduceBase : public IEngine {
// shutdown the engine
virtual void Shutdown(void);
/*!
* \brief set parameters to the engine
* \brief set parameters to the engine
* \param name parameter name
* \param val parameter value
*/
@@ -72,7 +72,7 @@ class AllreduceBase : public IEngine {
return host_uri;
}
/*!
* \brief perform in-place allreduce, on sendrecvbuf
* \brief perform in-place allreduce, on sendrecvbuf
* this function is NOT thread-safe
* \param sendrecvbuf_ buffer for both sending and recving data
* \param type_nbytes the unit number of bytes the type have
@@ -82,7 +82,7 @@ class AllreduceBase : public IEngine {
* will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf_.
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
* \param prepare_arg argument passed into the lazy preprocessing function
*/
*/
virtual void Allreduce(void *sendrecvbuf_,
size_t type_nbytes,
size_t count,
@@ -117,14 +117,14 @@ class AllreduceBase : public IEngine {
* \return the version number of check point loaded
* if returned version == 0, this means no model has been CheckPointed
* the p_model is not touched, user should do necessary initialization by themselves
*
*
* Common usage example:
* int iter = rabit::LoadCheckPoint(&model);
* if (iter == 0) model.InitParameters();
* for (i = iter; i < max_iter; ++i) {
* do many things, include allreduce
* rabit::CheckPoint(model);
* }
* }
*
* \sa CheckPoint, VersionNumber
*/
@@ -135,7 +135,7 @@ class AllreduceBase : public IEngine {
/*!
* \brief checkpoint the model, meaning we finished a stage of execution
* every time we call check point, there is a version number which will increase by one
*
*
* \param global_model pointer to the globally shared model/state
* when calling this function, the caller needs to guarantee that global_model
* is the same in all nodes
@@ -155,16 +155,16 @@ class AllreduceBase : public IEngine {
/*!
* \brief This function can be used to replace CheckPoint for global_model only,
* when certain conditions are met (see detailed explanation).
*
*
* This is a "lazy" checkpoint such that only the pointer to global_model is
* remembered and no memory copy is taken. To use this function, the user MUST ensure that:
* The global_model must remain unchanged until the last call of Allreduce/Broadcast in the current version finishes.
* In another words, global_model model can be changed only between last call of
* In another words, global_model model can be changed only between last call of
* Allreduce/Broadcast and LazyCheckPoint in current version
*
*
* For example, suppose the calling sequence is:
* LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint
*
*
* If user can only changes global_model in code3, then LazyCheckPoint can be used to
* improve efficiency of the program.
* \param global_model pointer to the globally shared model/state
@@ -191,8 +191,8 @@ class AllreduceBase : public IEngine {
virtual void InitAfterException(void) {
utils::Error("InitAfterException: not implemented");
}
/*!
* \brief report current status to the job tracker
/*!
* \brief report current status to the job tracker
* depending on the job tracker we are in
*/
inline void ReportStatus(void) const {
@@ -213,7 +213,7 @@ class AllreduceBase : public IEngine {
kRecvZeroLen,
/*! \brief a neighbor node goes down, the connection is dropped */
kSockError,
/*!
/*!
* \brief another node which is not my neighbor goes down,
* get Out-of-Band exception notification from my neighbor
*/
@@ -225,7 +225,7 @@ class AllreduceBase : public IEngine {
ReturnTypeEnum value;
// constructor
ReturnType() {}
ReturnType(ReturnTypeEnum value) : value(value){}
ReturnType(ReturnTypeEnum value) : value(value) {} // NOLINT(*)
inline bool operator==(const ReturnTypeEnum &v) const {
return value == v;
}
@@ -235,11 +235,11 @@ class AllreduceBase : public IEngine {
};
/*! \brief translate errno to return type */
inline static ReturnType Errno2Return() {
int errsv = utils::Socket::GetLastError();
int errsv = utils::Socket::GetLastError();
if (errsv == EAGAIN || errsv == EWOULDBLOCK || errsv == 0) return kSuccess;
#ifdef _WIN32
if (errsv == WSAEWOULDBLOCK) return kSuccess;
if (errsv == WSAECONNRESET) return kConnReset;
if (errsv == WSAEWOULDBLOCK) return kSuccess;
if (errsv == WSAECONNRESET) return kConnReset;
#endif
if (errsv == ECONNRESET) return kConnReset;
return kSockError;
@@ -260,7 +260,7 @@ class AllreduceBase : public IEngine {
// buffer size, in bytes
size_t buffer_size;
// constructor
LinkRecord(void)
LinkRecord(void)
: buffer_head(NULL), buffer_size(0) {
}
// initialize buffer
@@ -377,7 +377,7 @@ class AllreduceBase : public IEngine {
* A kSuccess return from TryAllreduce does NOT mean every node has successfully finished TryAllreduce.
* It only means the current node got the correct result of Allreduce.
* However, it does mean every node has finished the LAST call (instead of this one) of Allreduce/Bcast
*
*
* \param sendrecvbuf_ buffer for both sending and recving data
* \param type_nbytes the unit number of bytes the type have
* \param count number of elements to be reduced
@@ -397,7 +397,7 @@ class AllreduceBase : public IEngine {
* \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
* \sa ReturnType
*/
ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root);
ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root);
/*!
* \brief perform in-place allreduce, on sendrecvbuf,
* this function implements tree-shape reduction
@@ -433,14 +433,14 @@ class AllreduceBase : public IEngine {
size_t size_prev_slice);
/*!
* \brief perform in-place allreduce, reduce on the sendrecvbuf,
*
*
* after the function, node k get k-th segment of the reduction result
* the k-th segment is defined by [k * step, min((k + 1) * step,count) )
* where step = ceil(count / world_size)
*
* \param sendrecvbuf_ buffer for both sending and recving data
* \param type_nbytes the unit number of bytes the type have
* \param count number of elements to be reduced
* \param count number of elements to be reduced
* \param reducer reduce function
* \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
* \sa ReturnType, TryAllreduce
@@ -465,7 +465,7 @@ class AllreduceBase : public IEngine {
size_t count,
ReduceFunction reducer);
/*!
* \brief function used to report error when a link goes wrong
* \brief function used to report error when a link goes wrong
* \param link the pointer to the link who causes the error
* \param err the error type
*/
@@ -522,4 +522,4 @@ class AllreduceBase : public IEngine {
};
} // namespace engine
} // namespace rabit
#endif // RABIT_ALLREDUCE_BASE_H
#endif // RABIT_ALLREDUCE_BASE_H_

View File

@@ -1,8 +1,9 @@
/*!
* Copyright by Contributors
* \file allreduce_mock.h
* \brief Mock test module of AllReduce engine,
* insert failures in certain call point, to test if the engine is robust to failure
*
*
* \author Ignacio Cano, Tianqi Chen
*/
#ifndef RABIT_ALLREDUCE_MOCK_H_
@@ -68,7 +69,7 @@ class AllreduceMock : public AllreduceRobust {
DummySerializer dum;
ComboSerializer com(global_model, local_model);
return AllreduceRobust::LoadCheckPoint(&dum, &com);
}
}
}
virtual void CheckPoint(const Serializable *global_model,
const Serializable *local_model) {
@@ -100,6 +101,7 @@ class AllreduceMock : public AllreduceRobust {
this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "LazyCheckPoint");
AllreduceRobust::LazyCheckPoint(global_model);
}
protected:
// force checkpoint to local
int force_local;
@@ -108,7 +110,7 @@ class AllreduceMock : public AllreduceRobust {
// sum of allreduce
double tsum_allreduce;
double time_checkpoint;
private:
struct DummySerializer : public Serializable {
virtual void Load(Stream *fi) {
@@ -126,7 +128,7 @@ class AllreduceMock : public AllreduceRobust {
}
ComboSerializer(const Serializable *lhs, const Serializable *rhs)
: lhs(NULL), rhs(NULL), c_lhs(lhs), c_rhs(rhs) {
}
}
virtual void Load(Stream *fi) {
if (lhs != NULL) lhs->Load(fi);
if (rhs != NULL) rhs->Load(fi);
@@ -143,10 +145,10 @@ class AllreduceMock : public AllreduceRobust {
int seqno;
int ntrial;
MockKey(void) {}
MockKey(int rank, int version, int seqno, int ntrial)
MockKey(int rank, int version, int seqno, int ntrial)
: rank(rank), version(version), seqno(seqno), ntrial(ntrial) {}
inline bool operator==(const MockKey &b) const {
return rank == b.rank &&
return rank == b.rank &&
version == b.version &&
seqno == b.seqno &&
ntrial == b.ntrial;
@@ -173,4 +175,4 @@ class AllreduceMock : public AllreduceRobust {
};
} // namespace engine
} // namespace rabit
#endif // RABIT_ALLREDUCE_MOCK_H_
#endif // RABIT_ALLREDUCE_MOCK_H_

View File

@@ -2,17 +2,17 @@
* Copyright (c) 2014 by Contributors
* \file allreduce_robust-inl.h
* \brief implementation of inline template function in AllreduceRobust
*
*
* \author Tianqi Chen
*/
#ifndef RABIT_ENGINE_ROBUST_INL_H_
#define RABIT_ENGINE_ROBUST_INL_H_
#ifndef RABIT_ALLREDUCE_ROBUST_INL_H_
#define RABIT_ALLREDUCE_ROBUST_INL_H_
#include <vector>
namespace rabit {
namespace engine {
/*!
* \brief run message passing algorithm on the allreduce tree
* \brief run message passing algorithm on the allreduce tree
* the result is edge message stored in p_edge_in and p_edge_out
* \param node_value the value associated with current node
* \param p_edge_in used to store input message from each of the edge
@@ -35,7 +35,7 @@ inline AllreduceRobust::ReturnType
AllreduceRobust::MsgPassing(const NodeType &node_value,
std::vector<EdgeType> *p_edge_in,
std::vector<EdgeType> *p_edge_out,
EdgeType (*func)
EdgeType(*func)
(const NodeType &node_value,
const std::vector<EdgeType> &edge_in,
size_t out_index)) {
@@ -80,8 +80,16 @@ AllreduceRobust::MsgPassing(const NodeType &node_value,
selecter.WatchRead(links[i].sock);
}
break;
case 1: if (i == parent_index) selecter.WatchWrite(links[i].sock); break;
case 2: if (i == parent_index) selecter.WatchRead(links[i].sock); break;
case 1:
if (i == parent_index) {
selecter.WatchWrite(links[i].sock);
}
break;
case 2:
if (i == parent_index) {
selecter.WatchRead(links[i].sock);
}
break;
case 3:
if (i != parent_index && links[i].size_write != sizeof(EdgeType)) {
selecter.WatchWrite(links[i].sock);
@@ -158,4 +166,4 @@ AllreduceRobust::MsgPassing(const NodeType &node_value,
}
} // namespace engine
} // namespace rabit
#endif // RABIT_ENGINE_ROBUST_INL_H_
#endif // RABIT_ALLREDUCE_ROBUST_INL_H_

View File

@@ -27,7 +27,7 @@ AllreduceRobust::AllreduceRobust(void) {
result_buffer_round = 1;
global_lazycheck = NULL;
use_local_model = -1;
recover_counter = 0;
recover_counter = 0;
env_vars.push_back("rabit_global_replica");
env_vars.push_back("rabit_local_replica");
}
@@ -49,7 +49,7 @@ void AllreduceRobust::Shutdown(void) {
AllreduceBase::Shutdown();
}
/*!
* \brief set parameters to the engine
* \brief set parameters to the engine
* \param name parameter name
* \param val parameter value
*/
@@ -61,7 +61,7 @@ void AllreduceRobust::SetParam(const char *name, const char *val) {
}
}
/*!
* \brief perform in-place allreduce, on sendrecvbuf
* \brief perform in-place allreduce, on sendrecvbuf
* this function is NOT thread-safe
* \param sendrecvbuf_ buffer for both sending and recving data
* \param type_nbytes the unit number of bytes the type have
@@ -147,14 +147,14 @@ void AllreduceRobust::Broadcast(void *sendrecvbuf_, size_t total_size, int root)
* \return the version number of check point loaded
* if returned version == 0, this means no model has been CheckPointed
* the p_model is not touched, user should do necessary initialization by themselves
*
*
* Common usage example:
* int iter = rabit::LoadCheckPoint(&model);
* if (iter == 0) model.InitParameters();
* for (i = iter; i < max_iter; ++i) {
* do many things, include allreduce
* rabit::CheckPoint(model);
* }
* }
*
* \sa CheckPoint, VersionNumber
*/
@@ -208,7 +208,7 @@ int AllreduceRobust::LoadCheckPoint(Serializable *global_model,
* \brief internal consistency check function,
* use check to ensure user always call CheckPoint/LoadCheckPoint
* with or without local but not both, this function will set the appropriate settings
* in the first call of LoadCheckPoint/CheckPoint
* in the first call of LoadCheckPoint/CheckPoint
*
* \param with_local whether the user calls CheckPoint with local model
*/
@@ -224,14 +224,14 @@ void AllreduceRobust::LocalModelCheck(bool with_local) {
num_local_replica = 0;
}
} else {
utils::Check(use_local_model == int(with_local),
utils::Check(use_local_model == static_cast<int>(with_local),
"Can only call Checkpoint/LoadCheckPoint always with"\
"or without local_model, but not mixed case");
}
}
/*!
* \brief internal implementation of checkpoint, support both lazy and normal way
*
*
* \param global_model pointer to the globally shared model/state
* when calling this function, the caller needs to guarantee that global_model
* is the same in all nodes
@@ -423,7 +423,7 @@ AllreduceRobust::ReturnType AllreduceRobust::TryResetLinks(void) {
* recover links according to the error type reported
* if there is no error, return true
* \param err_type the type of error happening in the system
* \return true if err_type is kSuccess, false otherwise
* \return true if err_type is kSuccess, false otherwise
*/
bool AllreduceRobust::CheckAndRecover(ReturnType err_type) {
if (err_type == kSuccess) return true;
@@ -488,7 +488,7 @@ ShortestDist(const std::pair<bool, size_t> &node_value,
* \brief message passing function, used to decide the
* data request from each edge, whether need to request data from certain edge
* \param node_value a pair of request_data and best_link
* request_data stores whether current node need to request data
* request_data stores whether current node need to request data
* best_link gives the best edge index to fetch the data
* \param req_in the data request from incoming edges
* \param out_index the edge index of output link
@@ -524,7 +524,7 @@ inline char DataRequest(const std::pair<bool, int> &node_value,
*
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
* \sa ReturnType
*/
*/
AllreduceRobust::ReturnType
AllreduceRobust::TryDecideRouting(AllreduceRobust::RecoverType role,
size_t *p_size,
@@ -586,7 +586,7 @@ AllreduceRobust::TryDecideRouting(AllreduceRobust::RecoverType role,
*
* \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
* \sa ReturnType, TryDecideRouting
*/
*/
AllreduceRobust::ReturnType
AllreduceRobust::TryRecoverData(RecoverType role,
void *sendrecvbuf_,
@@ -644,7 +644,7 @@ AllreduceRobust::TryRecoverData(RecoverType role,
if (role == kRequestData) {
const int pid = recv_link;
if (selecter.CheckRead(links[pid].sock)) {
ReturnType ret = links[pid].ReadToArray(sendrecvbuf_, size);
ReturnType ret = links[pid].ReadToArray(sendrecvbuf_, size);
if (ret != kSuccess) {
return ReportError(&links[pid], ret);
}
@@ -823,10 +823,10 @@ AllreduceRobust::TryGetResult(void *sendrecvbuf, size_t size, int seqno, bool re
* \param buf the buffer to store the result
* \param size the total size of the buffer
* \param flag flag information about the action \sa ActionSummary
* \param seqno sequence number of the action, if it is special action with flag set,
* \param seqno sequence number of the action, if it is special action with flag set,
* seqno needs to be set to ActionSummary::kSpecialOp
*
* \return if this function can return true or false
* \return if this function can return true or false
* - true means buf already set to the
* result by recovering procedure, the action is complete, no further action is needed
* - false means this is the latest action that has not yet been executed, need to execute the action
@@ -907,7 +907,7 @@ bool AllreduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) {
* plus replication of states in previous num_local_replica hops in the ring
*
* The input parameters must contain the valid local states available in current nodes,
* This function tries its best to "complete" the missing parts of local_rptr and local_chkpt
* This function tries its best to "complete" the missing parts of local_rptr and local_chkpt
* If there is sufficient information in the ring, when the function returns, local_chkpt will
* contain num_local_replica + 1 checkpoints (including the chkpt of this node)
* If there is no sufficient information in the ring, this function the number of checkpoints

View File

@@ -5,7 +5,7 @@
* using TCP non-block socket and tree-shape reduction.
*
* This implementation considers the failure of nodes
*
*
* \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
*/
#ifndef RABIT_ALLREDUCE_ROBUST_H_
@@ -28,13 +28,13 @@ class AllreduceRobust : public AllreduceBase {
/*! \brief shutdown the engine */
virtual void Shutdown(void);
/*!
* \brief set parameters to the engine
* \brief set parameters to the engine
* \param name parameter name
* \param val parameter value
*/
virtual void SetParam(const char *name, const char *val);
/*!
* \brief perform in-place allreduce, on sendrecvbuf
* \brief perform in-place allreduce, on sendrecvbuf
* this function is NOT thread-safe
* \param sendrecvbuf_ buffer for both sending and recving data
* \param type_nbytes the unit number of bytes the type have
@@ -69,14 +69,14 @@ class AllreduceRobust : public AllreduceBase {
* \return the version number of check point loaded
* if returned version == 0, this means no model has been CheckPointed
* the p_model is not touched, user should do necessary initialization by themselves
*
*
* Common usage example:
* int iter = rabit::LoadCheckPoint(&model);
* if (iter == 0) model.InitParameters();
* for (i = iter; i < max_iter; ++i) {
* do many things, include allreduce
* rabit::CheckPoint(model);
* }
* }
*
* \sa CheckPoint, VersionNumber
*/
@@ -85,7 +85,7 @@ class AllreduceRobust : public AllreduceBase {
/*!
* \brief checkpoint the model, meaning we finished a stage of execution
* every time we call check point, there is a version number which will increase by one
*
*
* \param global_model pointer to the globally shared model/state
* when calling this function, the caller needs to guarantee that global_model
* is the same in all nodes
@@ -105,16 +105,16 @@ class AllreduceRobust : public AllreduceBase {
/*!
* \brief This function can be used to replace CheckPoint for global_model only,
* when certain conditions are met (see detailed explanation).
*
*
* This is a "lazy" checkpoint such that only the pointer to global_model is
* remembered and no memory copy is taken. To use this function, the user MUST ensure that:
* The global_model must remain unchanged until the last call of Allreduce/Broadcast in the current version finishes.
* In another words, global_model model can be changed only between last call of
* In another words, global_model model can be changed only between last call of
* Allreduce/Broadcast and LazyCheckPoint in current version
*
*
* For example, suppose the calling sequence is:
* LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint
*
*
* If user can only changes global_model in code3, then LazyCheckPoint can be used to
* improve efficiency of the program.
* \param global_model pointer to the globally shared model/state
@@ -287,6 +287,7 @@ class AllreduceRobust : public AllreduceBase {
if (seqno_.size() == 0) return -1;
return seqno_.back();
}
private:
// sequence number of each
std::vector<int> seqno_;
@@ -301,14 +302,14 @@ class AllreduceRobust : public AllreduceBase {
* \brief internal consistency check function,
* use check to ensure user always call CheckPoint/LoadCheckPoint
* with or without local but not both, this function will set the appropriate settings
* in the first call of LoadCheckPoint/CheckPoint
* in the first call of LoadCheckPoint/CheckPoint
*
* \param with_local whether the user calls CheckPoint with local model
*/
void LocalModelCheck(bool with_local);
/*!
* \brief internal implementation of checkpoint, support both lazy and normal way
*
*
* \param global_model pointer to the globally shared model/state
* when calling this function, the caller needs to guarantee that global_model
* is the same in all nodes
@@ -326,10 +327,10 @@ class AllreduceRobust : public AllreduceBase {
* after this function finishes, all the messages received and sent
* before in all live links are discarded,
* This allows us to get a fresh start after error has happened
*
*
* TODO(tqchen): this function is not yet functioning and is not used by the engine,
* a simple resetlink and reconnect strategy is used
*
*
* \return this function can return kSuccess or kSockError
* when kSockError is returned, it simply means there are bad sockets in the links,
* and some link recovery procedure is needed
@@ -340,7 +341,7 @@ class AllreduceRobust : public AllreduceBase {
* recover links according to the error type reported
* if there is no error, return true
* \param err_type the type of error happening in the system
* \return true if err_type is kSuccess, false otherwise
* \return true if err_type is kSuccess, false otherwise
*/
bool CheckAndRecover(ReturnType err_type);
/*!
@@ -355,7 +356,7 @@ class AllreduceRobust : public AllreduceBase {
* \param seqno sequence number of the action, if it is special action with flag set,
* seqno needs to be set to ActionSummary::kSpecialOp
*
* \return if this function can return true or false
* \return if this function can return true or false
* - true means buf already set to the
* result by recovering procedure, the action is complete, no further action is needed
* - false means this is the latest action that has not yet been executed, need to execute the action
@@ -364,7 +365,7 @@ class AllreduceRobust : public AllreduceBase {
int seqno = ActionSummary::kSpecialOp);
/*!
* \brief try to load check point
*
*
* This is a collaborative function called by all nodes
* only the nodes with requester set to true really needs to load the check point
* other nodes acts as collaborative roles to complete this request
@@ -395,7 +396,7 @@ class AllreduceRobust : public AllreduceBase {
* \param p_size used to store the size of the message, for node in state kHaveData,
* this size must be set correctly before calling the function
* for others, this serves as an output parameter
* \param p_recvlink used to store the link current node should recv data from, if necessary
* this can be -1, which means current node have the data
* \param p_req_in used to store the resulting vector, indicating which link we should send the data to
@@ -432,7 +433,7 @@ class AllreduceRobust : public AllreduceBase {
* plus replication of states in previous num_local_replica hops in the ring
*
* The input parameters must contain the valid local states available in current nodes,
* This function tries its best to "complete" the missing parts of local_rptr and local_chkpt
* This function tries its best to "complete" the missing parts of local_rptr and local_chkpt
* If there is sufficient information in the ring, when the function returns, local_chkpt will
* contain num_local_replica + 1 checkpoints (including the chkpt of this node)
* If there is no sufficient information in the ring, this function the number of checkpoints
@@ -487,7 +488,7 @@ o * the input state must exactly one saved state(local state of current node)
LinkRecord *read_link,
LinkRecord *write_link);
/*!
* \brief run message passing algorithm on the allreduce tree
* \brief run message passing algorithm on the allreduce tree
* the result is edge message stored in p_edge_in and p_edge_out
* \param node_value the value associated with current node
* \param p_edge_in used to store input message from each of the edge
@@ -509,7 +510,7 @@ o * the input state must exactly one saved state(local state of current node)
inline ReturnType MsgPassing(const NodeType &node_value,
std::vector<EdgeType> *p_edge_in,
std::vector<EdgeType> *p_edge_out,
EdgeType (*func)
EdgeType(*func)
(const NodeType &node_value,
const std::vector<EdgeType> &edge_in,
size_t out_index));

View File

@@ -3,7 +3,7 @@
* \file engine.cc
* \brief this file governs which implementation of engine we are actually using
* provides a singleton of the engine interface
*
*
* \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
*/
#define _CRT_SECURE_NO_WARNINGS
@@ -60,7 +60,7 @@ void Allreduce_(void *sendrecvbuf,
}
// code for reduce handle
ReduceHandle::ReduceHandle(void)
ReduceHandle::ReduceHandle(void)
: handle_(NULL), redfunc_(NULL), htype_(NULL) {
}
ReduceHandle::~ReduceHandle(void) {}

View File

@@ -3,7 +3,7 @@
* \file engine_mpi.cc
* \brief this file gives an implementation of engine interface using MPI,
* this will allow rabit programs to run with MPI, but does not come with fault tolerance
*
*
* \author Tianqi Chen
*/
#define _CRT_SECURE_NO_WARNINGS
@@ -143,7 +143,7 @@ void Allreduce_(void *sendrecvbuf,
}
// code for reduce handle
ReduceHandle::ReduceHandle(void)
ReduceHandle::ReduceHandle(void)
: handle_(NULL), redfunc_(NULL), htype_(NULL) {
}
ReduceHandle::~ReduceHandle(void) {
@@ -166,7 +166,7 @@ void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
if (type_nbytes != 0) {
MPI::Datatype *dtype = new MPI::Datatype();
if (type_nbytes % 8 == 0) {
*dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));
*dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long)); // NOLINT(*)
} else if (type_nbytes % 4 == 0) {
*dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
} else {
@@ -195,7 +195,7 @@ void ReduceHandle::Allreduce(void *sendrecvbuf,
dtype->Free();
}
if (type_nbytes % 8 == 0) {
*dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));
*dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long)); // NOLINT(*)
} else if (type_nbytes % 4 == 0) {
*dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
} else {

View File

@@ -51,7 +51,7 @@ struct SockAddr {
utils::Check(gethostname(&buf[0], 256) != -1, "fail to get host name");
return std::string(buf.c_str());
}
/*!
/*!
* \brief set the address
* \param url the url of the address
* \param port the port of address
@@ -83,7 +83,7 @@ struct SockAddr {
}
};
/*!
/*!
* \brief base class containing common operations of TCP and UDP sockets
*/
class Socket {
@@ -95,7 +95,7 @@ class Socket {
return sockfd;
}
/*!
* \return last error of socket operation
* \return last error of socket operation
*/
inline static int GetLastError(void) {
#ifdef _WIN32
@@ -106,7 +106,7 @@ class Socket {
}
/*! \return whether last error was would block */
inline static bool LastErrorWouldBlock(void) {
int errsv = GetLastError();
int errsv = GetLastError();
#ifdef _WIN32
return errsv == WSAEWOULDBLOCK;
#else
@@ -129,15 +129,15 @@ class Socket {
}
#endif
}
/*!
/*!
* \brief shutdown the socket module after use, all sockets need to be closed
*/
*/
inline static void Finalize(void) {
  // Only builds that define _WIN32 have anything to tear down here:
  // they call WSACleanup(); otherwise the body is empty.
#if defined(_WIN32)
  WSACleanup();
#endif  // defined(_WIN32)
}
/*!
/*!
* \brief set this socket to use non-blocking mode
* \param non_block whether set it to be non-block, if it is false
* it will set it back to block mode
@@ -163,8 +163,8 @@ class Socket {
}
#endif
}
/*!
* \brief bind the socket to an address
/*!
* \brief bind the socket to an address
* \param addr the address to bind the socket to
*/
inline void Bind(const SockAddr &addr) {
@@ -173,7 +173,7 @@ class Socket {
Socket::Error("Bind");
}
}
/*!
/*!
* \brief try bind the socket to host, from start_port to end_port
* \param start_port starting port number to try
* \param end_port ending port number to try
@@ -188,11 +188,11 @@ class Socket {
return port;
}
#if defined(_WIN32)
if (WSAGetLastError() != WSAEADDRINUSE) {
Socket::Error("TryBindHost");
}
if (WSAGetLastError() != WSAEADDRINUSE) {
Socket::Error("TryBindHost");
}
#else
if (errno != EADDRINUSE) {
if (errno != EADDRINUSE) {
Socket::Error("TryBindHost");
}
#endif
@@ -248,7 +248,7 @@ class Socket {
}
};
/*!
/*!
* \brief a wrapper of TCP socket that is hopefully cross-platform
*/
class TCPSocket : public Socket{
@@ -261,10 +261,11 @@ class TCPSocket : public Socket{
/*!
* \brief enable/disable TCP keepalive
* \param keepalive whether to set the keep alive option on
*/
*/
inline void SetKeepAlive(bool keepalive) {
int opt = static_cast<int>(keepalive);
if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE, reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) {
if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE,
reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) {
Socket::Error("SetKeepAlive");
}
}
@@ -294,12 +295,12 @@ class TCPSocket : public Socket{
return TCPSocket(newfd);
}
/*!
* \brief decide whether the socket is at OOB mark
* \brief decide whether the socket is at OOB mark
* \return 1 if at mark, 0 if not, -1 if an error occurred
*/
inline int AtMark(void) const {
#ifdef _WIN32
unsigned long atmark;
unsigned long atmark; // NOLINT(*)
if (ioctlsocket(sockfd, SIOCATMARK, &atmark) != NO_ERROR) return -1;
#else
int atmark;
@@ -307,8 +308,8 @@ class TCPSocket : public Socket{
#endif
return static_cast<int>(atmark);
}
/*!
* \brief connect to an address
/*!
* \brief connect to an address
* \param addr the address to connect to
* \return whether connect is successful
*/
@@ -328,8 +329,8 @@ class TCPSocket : public Socket{
const char *buf = reinterpret_cast<const char*>(buf_);
return send(sockfd, buf, static_cast<sock_size_t>(len), flag);
}
/*!
* \brief receive data using the socket
/*!
* \brief receive data using the socket
* \param buf_ the pointer to the buffer
* \param len the size of the buffer
* \param flags extra flags
@@ -385,7 +386,7 @@ class TCPSocket : public Socket{
return ndone;
}
/*!
* \brief send a string over network
* \brief send a string over network
* \param str the string to be sent
*/
inline void SendStr(const std::string &str) {
@@ -423,7 +424,7 @@ struct SelectHelper {
maxfd = 0;
}
/*!
* \brief add file descriptor to watch for read
* \brief add file descriptor to watch for read
* \param fd file descriptor to be watched
*/
inline void WatchRead(SOCKET fd) {
@@ -473,7 +474,7 @@ struct SelectHelper {
* \param timeout the timeout counter, can be 0, which means wait until the event happens
* \return 1 if success, 0 if timeout, and -1 if error occurs
*/
inline static int WaitExcept(SOCKET fd, long timeout = 0) {
inline static int WaitExcept(SOCKET fd, long timeout = 0) { // NOLINT(*)
fd_set wait_set;
FD_ZERO(&wait_set);
FD_SET(fd, &wait_set);
@@ -486,10 +487,10 @@ struct SelectHelper {
* \param select_write whether to watch for write event
* \param select_except whether to watch for exception event
* \param timeout specify timeout in micro-seconds(ms) if equals 0, means select will always block
* \return number of active descriptors selected,
* \return number of active descriptors selected,
* return -1 if error occurs
*/
inline int Select(long timeout = 0) {
inline int Select(long timeout = 0) { // NOLINT(*)
int ret = Select_(static_cast<int>(maxfd + 1),
&read_set, &write_set, &except_set, timeout);
if (ret == -1) {
@@ -500,7 +501,7 @@ struct SelectHelper {
private:
inline static int Select_(int maxfd, fd_set *rfds,
fd_set *wfds, fd_set *efds, long timeout) {
fd_set *wfds, fd_set *efds, long timeout) { // NOLINT(*)
#if !defined(_WIN32)
utils::Assert(maxfd < FD_SETSIZE, "maxdf must be smaller than FDSETSIZE");
#endif