From eb2590b7742c06a6b99ed319c7ca5629fc1dc058 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Fri, 26 Oct 2018 12:39:31 -0700 Subject: [PATCH] workaround macosx java test race condition (#74) * fix error in dmlc#57, clean up comments and naming * include missing packages, disable recovery tests for now * disable local_recover tests until we have a bug fix * support larger cluster * fix lint, merge with master * fix mac osx test failure in https://github.com/dmlc/xgboost/pull/3818 * Update allreduce_robust.cc --- src/allreduce_robust.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/allreduce_robust.cc b/src/allreduce_robust.cc index 210d5d8a3..db829eaa5 100644 --- a/src/allreduce_robust.cc +++ b/src/allreduce_robust.cc @@ -50,6 +50,12 @@ void AllreduceRobust::Shutdown(void) { // execute check ack step, load happens here utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp), "Shutdown: check ack must return true"); +#ifdef __APPLE__ + // On OSX, one worker shuts down and closes its sockets while the rest still run kCheckAck. + // This causes the remaining workers to checkandrecover and hang indefinitely, see https://github.com/dmlc/xgboost/pull/3818 + // TODO: find a fundamental fix for this + sleep(2); +#endif AllreduceBase::Shutdown(); } /*!