[rabit harden] fix rabit tests (#81)
* enable model recovery tests
* force use of gcc 4.8 in Travis
This commit is contained in:
@@ -268,8 +268,9 @@ void AllreduceRobust::CheckPoint_(const Serializable *global_model,
|
||||
if (num_local_replica != 0) {
|
||||
while (true) {
|
||||
if (RecoverExec(NULL, 0, 0, ActionSummary::kLocalCheckPoint)) break;
|
||||
// save model model to new version place
|
||||
// save model to new version place
|
||||
int new_version = !local_chkpt_version;
|
||||
|
||||
local_chkpt[new_version].clear();
|
||||
utils::MemoryBufferStream fs(&local_chkpt[new_version]);
|
||||
if (local_model != NULL) {
|
||||
@@ -296,6 +297,7 @@ void AllreduceRobust::CheckPoint_(const Serializable *global_model,
|
||||
if (lazy_checkpt) {
|
||||
global_lazycheck = global_model;
|
||||
} else {
|
||||
printf("[%d] save global checkpoint #%d \n", this->rank, version_number);
|
||||
global_checkpoint.resize(0);
|
||||
utils::MemoryBufferStream fs(&global_checkpoint);
|
||||
fs.Write(&version_number, sizeof(version_number));
|
||||
@@ -737,6 +739,9 @@ AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) {
|
||||
succ = TryRecoverLocalState(&local_rptr[local_chkpt_version],
|
||||
&local_chkpt[local_chkpt_version]);
|
||||
if (succ != kSuccess) return succ;
|
||||
|
||||
printf("[%d] recovered from local checkpoint version %d \n", this->rank, local_chkpt_version);
|
||||
|
||||
int nlocal = std::max(static_cast<int>(local_rptr[local_chkpt_version].size()) - 1, 0);
|
||||
// check if everyone is OK
|
||||
unsigned state = 0;
|
||||
|
||||
Reference in New Issue
Block a user