before make rabit public
This commit is contained in:
parent
cc410b8c90
commit
821eb21ae2
@ -2,7 +2,7 @@
|
||||
|
||||
rabit is a lightweight library that provides a fault-tolerant interface for Allreduce and Broadcast. It is designed to support easy implementation of distributed machine learning programs, many of which fit naturally under the Allreduce abstraction.
|
||||
|
||||
Contributors: https://github.com/tqchen/rabit/graphs/contributors
|
||||
Interface: [rabit.h](src/rabit.h)
|
||||
|
||||
Features
|
||||
====
|
||||
@ -27,4 +27,3 @@ Design Goal
|
||||
* rabit should run fast
|
||||
* rabit is lightweight
|
||||
* rabit digs safe burrows to avoid disasters
|
||||
|
||||
|
||||
@ -137,7 +137,7 @@ int AllreduceRobust::LoadCheckPoint(utils::ISerializable *global_model,
|
||||
// reset result buffer
|
||||
resbuf.Clear(); seq_counter = 0;
|
||||
// load from buffer
|
||||
utils::MemoryBufferStream fs(&mglobal_model);
|
||||
utils::MemoryBufferStream fs(&global_checkpoint);
|
||||
fs.Read(&version_number, sizeof(version_number));
|
||||
if (version_number == 0) return version_number;
|
||||
global_model->Load(fs);
|
||||
@ -174,11 +174,12 @@ void AllreduceRobust::CheckPoint(const utils::ISerializable *global_model,
|
||||
// execute checkpoint; note: when a checkpoint exists, load will not happen
|
||||
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, ActionSummary::kMaxSeq),
|
||||
"check point must return true");
|
||||
// this is the critical region where we will change all the stored models
|
||||
// increase version number
|
||||
version_number += 1;
|
||||
// save model
|
||||
mglobal_model.resize(0);
|
||||
utils::MemoryBufferStream fs(&mglobal_model);
|
||||
global_checkpoint.resize(0);
|
||||
utils::MemoryBufferStream fs(&global_checkpoint);
|
||||
fs.Write(&version_number, sizeof(version_number));
|
||||
global_model->Save(fs);
|
||||
// reset result buffer
|
||||
@ -580,16 +581,16 @@ AllreduceRobust::TryRecoverData(RecoverType role,
|
||||
*/
|
||||
AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) {
|
||||
RecoverType role = requester ? kRequestData : kHaveData;
|
||||
size_t size = this->mglobal_model.length();
|
||||
size_t size = this->global_checkpoint.length();
|
||||
int recv_link;
|
||||
std::vector<bool> req_in;
|
||||
ReturnType succ = TryDecideRouting(role, &size, &recv_link, &req_in);
|
||||
if (succ != kSuccess) return succ;
|
||||
if (role == kRequestData) {
|
||||
mglobal_model.resize(size);
|
||||
global_checkpoint.resize(size);
|
||||
}
|
||||
if (size == 0) return kSuccess;
|
||||
return TryRecoverData(role, &mglobal_model[0], size, recv_link, req_in);
|
||||
return TryRecoverData(role, &global_checkpoint[0], size, recv_link, req_in);
|
||||
}
|
||||
/*!
|
||||
* \brief try to get the result of operation specified by seqno
|
||||
|
||||
@ -410,7 +410,7 @@ class AllreduceRobust : public AllreduceBase {
|
||||
// result buffer of all reduce
|
||||
ResultBuffer resbuf;
|
||||
// last check point global model
|
||||
std::string mglobal_model;
|
||||
std::string global_checkpoint;
|
||||
// number of replica for local state/model
|
||||
int num_local_replica;
|
||||
// pointer to memory position in the local model
|
||||
@ -419,9 +419,9 @@ class AllreduceRobust : public AllreduceBase {
|
||||
// local_model[rptr[k]:rptr[k+1]] stores the model of node in previous k hops in the ring
|
||||
std::vector<size_t> local_rptr;
|
||||
// storage for local model replicas
|
||||
std::string mlocal_model;
|
||||
// temporal storage
|
||||
std::string tmp_local_model;
|
||||
std::string local_checkpoint;
|
||||
// temporary storage for doing local checkpointing
|
||||
std::string tmp_local_check;
|
||||
};
|
||||
} // namespace engine
|
||||
} // namespace rabit
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user