Exit when an allreduce/broadcast error causes a timeout (#112)

* keep async timeout task

* add missing pthread to cmake

* add tests

* Add a sleep period to avoid flushing the tracker.
This commit is contained in:
Chen Qin
2019-10-11 00:39:39 -07:00
committed by Jiaming Yuan
parent af7281afe3
commit 5d1b613910
17 changed files with 403 additions and 71 deletions

View File

@@ -7,6 +7,7 @@
#ifndef RABIT_INTERNAL_UTILS_H_
#define RABIT_INTERNAL_UTILS_H_
#define _CRT_SECURE_NO_WARNINGS
#include <string.h>
#include <cstdio>
#include <string>
#include <cstdlib>
@@ -66,6 +67,11 @@ const int kPrintBuffer = 1 << 12;
* co-locate in the same process */
extern bool STOP_PROCESS_ON_ERROR;
/*!
 * \brief parse a config string to bool
 * \param s NUL-terminated config value; NULL is treated as false
 * \return true when s equals "true" (ASCII case-insensitive) or starts with
 *  a non-zero decimal integer (e.g. "1", "-3"), false otherwise
 */
inline bool StringToBool(const char* s) {
  // a missing value must not be handed to the string routines below
  if (s == NULL) return false;
#ifdef _MSC_VER
  // MSVC does not provide POSIX strcasecmp; _stricmp is its equivalent
  const bool is_true_literal = _stricmp(s, "true") == 0;
#else
  const bool is_true_literal = strcasecmp(s, "true") == 0;
#endif
  // strtol parses like atoi but has defined behaviour for out-of-range input
  return is_true_literal || strtol(s, NULL, 10) != 0;
}
#ifndef RABIT_CUSTOMIZE_MSG_
/*!
* \brief handling of Assert error, caused by inappropriate input
@@ -86,7 +92,7 @@ inline void HandleAssertError(const char *msg) {
*/
inline void HandleCheckError(const char *msg) {
if (STOP_PROCESS_ON_ERROR) {
fprintf(stderr, "%s, shutting down process", msg);
fprintf(stderr, "%s, shutting down process\n", msg);
exit(-1);
} else {
fprintf(stderr, "%s, rabit is configured to keep process running\n", msg);