From 3f22596e3c22d53a15e05e2f6b01541a887dcc3f Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 9 Dec 2014 20:57:54 -0800 Subject: [PATCH] check in license --- LICENSE | 13 +++++++++++++ src/allreduce_robust.cc | 15 ++++++++++----- test/test_local_recover.cpp | 2 +- test/test_model_recover.cpp | 4 ++-- 4 files changed, 26 insertions(+), 8 deletions(-) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..ebf9611d7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Copyright (c) 2014 by Contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/src/allreduce_robust.cc b/src/allreduce_robust.cc index 1de92b7d6..99906fdc6 100644 --- a/src/allreduce_robust.cc +++ b/src/allreduce_robust.cc @@ -20,6 +20,7 @@ AllreduceRobust::AllreduceRobust(void) { result_buffer_round = 1; num_local_replica = 0; seq_counter = 0; + local_chkpt_version = 0; } /*! \brief shutdown the engine */ void AllreduceRobust::Shutdown(void) { @@ -619,16 +620,16 @@ AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) { // check in local data RecoverType role = requester ? kRequestData : kHaveData; ReturnType succ; - if (num_local_replica != 0) { + if (false) { if (requester) { // clear existing history, if any, before load local_rptr[local_chkpt_version].clear(); local_chkpt[local_chkpt_version].clear(); } // recover local checkpoint - succ = TryRecoverLocalState(&local_rptr[local_chkpt_version], - &local_chkpt[local_chkpt_version]); - if (succ != kSuccess) return succ; + //succ = TryRecoverLocalState(&local_rptr[local_chkpt_version], + //m&local_chkpt[local_chkpt_version]); + //if (succ != kSuccess) return succ; int nlocal = std::max(static_cast(local_rptr[local_chkpt_version].size()) - 1, 0); // check if everyone is OK unsigned state = 0; @@ -817,7 +818,8 @@ AllreduceRobust::TryRecoverLocalState(std::vector *p_local_rptr, utils::Assert(chkpt.length() == 0, "local chkpt space inconsistent"); } const int n = num_local_replica; - {// backward passing, passing state in backward direction of the ring + utils::LogPrintf("[%d] backward!!\n", rabit::GetRank()); + if(false){// backward passing, passing state in backward direction of the ring const int nlocal = static_cast(rptr.size() - 1); utils::Assert(nlocal <= n + 1, "invalid local replica"); std::vector msg_back(n + 1); @@ -870,6 +872,8 @@ AllreduceRobust::TryRecoverLocalState(std::vector *p_local_rptr, rptr.resize(nlocal + 1); chkpt.resize(rptr.back()); return succ; } } + + utils::LogPrintf("[%d] FORward!!\n", rabit::GetRank()); {// forward passing, passing state in forward direction of the ring const int nlocal = static_cast(rptr.size() - 1); utils::Assert(nlocal <= n + 1, "invalid local replica"); @@ -933,6 +937,7 @@ AllreduceRobust::TryRecoverLocalState(std::vector *p_local_rptr, rptr.resize(nlocal + 1); chkpt.resize(rptr.back()); return succ; } } + utils::LogPrintf("[%d] Finished!!\n", rabit::GetRank()); return kSuccess; } /*! diff --git a/test/test_local_recover.cpp b/test/test_local_recover.cpp index 87262ba7b..27d4541a4 100644 --- a/test/test_local_recover.cpp +++ b/test/test_local_recover.cpp @@ -41,7 +41,7 @@ inline void TestMax(test::Mock &mock, Model *model, Model *local, int ntrial, in } mock.Allreduce(&ndata[0], ndata.size()); if (ntrial == iter && rank == 3) { - //exit(-1); + throw MockException(); } for (size_t i = 0; i < ndata.size(); ++i) { float rmax = (i * 1) % z + model->data[i]; diff --git a/test/test_model_recover.cpp b/test/test_model_recover.cpp index 2b72cde75..ae5369dc2 100644 --- a/test/test_model_recover.cpp +++ b/test/test_model_recover.cpp @@ -41,7 +41,7 @@ inline void TestMax(test::Mock &mock, Model *model, int ntrial, int iter) { } mock.Allreduce(&ndata[0], ndata.size()); if (ntrial == iter && rank == 3) { - exit(-1); + // exit(-1); } for (size_t i = 0; i < ndata.size(); ++i) { float rmax = (i * 1) % z + model->data[i]; @@ -65,7 +65,7 @@ inline void TestSum(test::Mock &mock, Model *model, int ntrial, int iter) { mock.Allreduce(&ndata[0], ndata.size()); if (ntrial == iter && rank == 0) { - exit(-1); + throw MockException(); } for (size_t i = 0; i < ndata.size(); ++i) {