From 821eb21ae28a5f1c4acb6dad8646768b31652138 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 4 Dec 2014 17:30:58 -0800 Subject: [PATCH] before make rabit public --- README.md | 3 +-- src/allreduce_robust.cc | 15 ++++++++------- src/allreduce_robust.h | 10 +++++----- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index eac939317..a7b333eb8 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ rabit is a light weight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementation of distributed machine learning programs, many of which sits naturally under Allreduce abstraction. -Contributors: https://github.com/tqchen/rabit/graphs/contributors +Interface: [rabit.h](src/rabit.h) Features ==== @@ -27,4 +27,3 @@ Design Goal * rabit should run fast * rabit is light weight * rabit dig safe burrows to avoid disasters - diff --git a/src/allreduce_robust.cc b/src/allreduce_robust.cc index a878f5618..3232ef40c 100644 --- a/src/allreduce_robust.cc +++ b/src/allreduce_robust.cc @@ -137,7 +137,7 @@ int AllreduceRobust::LoadCheckPoint(utils::ISerializable *global_model, // reset result buffer resbuf.Clear(); seq_counter = 0; // load from buffer - utils::MemoryBufferStream fs(&mglobal_model); + utils::MemoryBufferStream fs(&global_checkpoint); fs.Read(&version_number, sizeof(version_number)); if (version_number == 0) return version_number; global_model->Load(fs); @@ -155,7 +155,7 @@ int AllreduceRobust::LoadCheckPoint(utils::ISerializable *global_model, /*! 
* \brief checkpoint the model, meaning we finished a stage of execution * every time we call check point, there is a version number which will increase by one - * + * * \param global_model pointer to the globally shared model/state * when calling this function, the caller need to gauranttees that global_model * is the same in all nodes @@ -174,11 +174,12 @@ void AllreduceRobust::CheckPoint(const utils::ISerializable *global_model, // execute checkpoint, note: when checkpoint existing, load will not happen utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, ActionSummary::kMaxSeq), "check point must return true"); + // this is the critical region where we will change all the stored models // increase version number version_number += 1; // save model - mglobal_model.resize(0); - utils::MemoryBufferStream fs(&mglobal_model); + global_checkpoint.resize(0); + utils::MemoryBufferStream fs(&global_checkpoint); fs.Write(&version_number, sizeof(version_number)); global_model->Save(fs); // reset result buffer @@ -580,16 +581,16 @@ AllreduceRobust::TryRecoverData(RecoverType role, */ AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) { RecoverType role = requester ? kRequestData : kHaveData; - size_t size = this->mglobal_model.length(); + size_t size = this->global_checkpoint.length(); int recv_link; std::vector req_in; ReturnType succ = TryDecideRouting(role, &size, &recv_link, &req_in); if (succ != kSuccess) return succ; if (role == kRequestData) { - mglobal_model.resize(size); + global_checkpoint.resize(size); } if (size == 0) return kSuccess; - return TryRecoverData(role, &mglobal_model[0], size, recv_link, req_in); + return TryRecoverData(role, &global_checkpoint[0], size, recv_link, req_in); } /*! 
* \brief try to get the result of operation specified by seqno diff --git a/src/allreduce_robust.h b/src/allreduce_robust.h index d1018907c..45820a017 100644 --- a/src/allreduce_robust.h +++ b/src/allreduce_robust.h @@ -349,7 +349,7 @@ class AllreduceRobust : public AllreduceBase { void *sendrecvbuf_, size_t size, int recv_link, - const std::vector &req_in); + const std::vector &req_in); /*! * \brief perform a ring passing to receive data from prev link, and sent data to next link * this allows data to stream over a ring structure @@ -410,7 +410,7 @@ class AllreduceRobust : public AllreduceBase { // result buffer of all reduce ResultBuffer resbuf; // last check point global model - std::string mglobal_model; + std::string global_checkpoint; // number of replica for local state/model int num_local_replica; // pointer to memory position in the local model @@ -419,9 +419,9 @@ class AllreduceRobust : public AllreduceBase { // local_model[rptr[k]:rptr[k+1]] stores the model of node in previous k hops in the ring std::vector local_rptr; // storage for local model replicas - std::string mlocal_model; - // temporal storage - std::string tmp_local_model; + std::string local_checkpoint; + // temporary storage for doing local checkpointing + std::string tmp_local_check; }; } // namespace engine } // namespace rabit