allow not stop process in error (#97)

* allow not stop process in error

* fix merge error
This commit is contained in:
Nan Zhu 2019-06-25 13:04:39 -07:00 committed by GitHub
parent a429748e24
commit fc85f776f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 44 additions and 4 deletions

View File

@ -10,6 +10,7 @@
#include <cstdio>
#include <string>
#include <cstdlib>
#include <stdexcept>
#include <vector>
#ifndef RABIT_STRICT_CXX98_
@ -61,22 +62,36 @@ namespace utils {
/*! \brief error message buffer length */
const int kPrintBuffer = 1 << 12;
/*! \brief we may want to keep the process alive when multiple workers
 * are co-located in the same process */
extern bool STOP_PROCESS_ON_ERROR;
#ifndef RABIT_CUSTOMIZE_MSG_
/*!
 * \brief handling of Assert error, caused by inappropriate input
 *
 * Writes the message to stderr, then either terminates the process with
 * exit(-1) or throws, depending on STOP_PROCESS_ON_ERROR.
 * \param msg error message
 * \throw std::runtime_error carrying msg when STOP_PROCESS_ON_ERROR is false
 */
inline void HandleAssertError(const char *msg) {
  // The flag must be consulted before anything terminates the process;
  // an unconditional exit ahead of this check would make the keep-alive
  // branch unreachable.
  if (STOP_PROCESS_ON_ERROR) {
    fprintf(stderr, "AssertError:%s, shutting down process\n", msg);
    exit(-1);
  } else {
    fprintf(stderr, "AssertError:%s, rabit is configured to keep process running\n", msg);
    throw std::runtime_error(msg);
  }
}
/*!
 * \brief handling of Check error, caused by inappropriate input
 *
 * Writes the message to stderr, then either terminates the process with
 * exit(-1) or throws, depending on STOP_PROCESS_ON_ERROR.
 * \param msg error message
 * \throw std::runtime_error carrying msg when STOP_PROCESS_ON_ERROR is false
 */
inline void HandleCheckError(const char *msg) {
  // The flag must be consulted before anything terminates the process;
  // an unconditional exit ahead of this check would make the keep-alive
  // branch unreachable.
  if (STOP_PROCESS_ON_ERROR) {
    // trailing newline added so the final line is terminated like every
    // other message emitted by these handlers
    fprintf(stderr, "%s, shutting down process\n", msg);
    exit(-1);
  } else {
    fprintf(stderr, "%s, rabit is configured to keep process running\n", msg);
    throw std::runtime_error(msg);
  }
}
inline void HandlePrint(const char *msg) {
printf("%s", msg);

View File

@ -14,6 +14,11 @@
#include "./allreduce_base.h"
namespace rabit {
namespace utils {
// Error-handling switch, on by default. SetParam in this file overwrites it
// when DMLC_WORKER_STOP_PROCESS_ON_ERROR is given as "true" or "false".
bool STOP_PROCESS_ON_ERROR(true);
}  // namespace utils
namespace engine {
// constructor
AllreduceBase::AllreduceBase(void) {
@ -48,6 +53,7 @@ AllreduceBase::AllreduceBase(void) {
env_vars.push_back("DMLC_TRACKER_URI");
env_vars.push_back("DMLC_TRACKER_PORT");
env_vars.push_back("DMLC_WORKER_CONNECT_RETRY");
env_vars.push_back("DMLC_WORKER_STOP_PROCESS_ON_ERROR");
}
// initialization function
@ -190,6 +196,15 @@ void AllreduceBase::SetParam(const char *name, const char *val) {
if (!strcmp(name, "DMLC_WORKER_CONNECT_RETRY")) {
connect_retry = atoi(val);
}
if (!strcmp(name, "DMLC_WORKER_STOP_PROCESS_ON_ERROR")) {
if (!strcmp(val, "true")) {
rabit::utils::STOP_PROCESS_ON_ERROR = true;
} else if (!strcmp(val, "false")) {
rabit::utils::STOP_PROCESS_ON_ERROR = false;
} else {
throw std::runtime_error("invalid value of DMLC_WORKER_STOP_PROCESS_ON_ERROR");
}
}
}
/*!
* \brief initialize connection to the tracker

View File

@ -13,6 +13,11 @@
#include "../include/rabit/internal/engine.h"
namespace rabit {
namespace utils {
// Error-handling switch, on by default; nothing visible in this translation
// unit reassigns it.
bool STOP_PROCESS_ON_ERROR(true);
}  // namespace utils
namespace engine {
/*! \brief EmptyEngine */
class EmptyEngine : public IEngine {

View File

@ -15,6 +15,11 @@
#include "../include/rabit/internal/utils.h"
namespace rabit {
namespace utils {
// Error-handling switch, on by default; presumably the definition matching
// the extern declaration pulled in via utils.h (confirm against the header).
bool STOP_PROCESS_ON_ERROR(true);
}  // namespace utils
namespace engine {
/*! \brief implementation of engine using MPI */
class MPIEngine : public IEngine {