allow not stop process in error (#97)
* allow not stop process in error * fix merge error
This commit is contained in:
parent
a429748e24
commit
fc85f776f4
@ -10,6 +10,7 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <cstdlib>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
#ifndef RABIT_STRICT_CXX98_
|
||||
@ -61,22 +62,36 @@ namespace utils {
|
||||
/*! \brief error message buffer length */
|
||||
const int kPrintBuffer = 1 << 12;
|
||||
|
||||
/*! \brief we may want to keep the process alive when there are multiple workers
|
||||
* co-located in the same process */
|
||||
extern bool STOP_PROCESS_ON_ERROR;
|
||||
|
||||
#ifndef RABIT_CUSTOMIZE_MSG_
|
||||
/*!
|
||||
* \brief handling of Assert error, caused by inappropriate input
|
||||
* \param msg error message
|
||||
*/
|
||||
inline void HandleAssertError(const char *msg) {
|
||||
fprintf(stderr, "AssertError:%s\n", msg);
|
||||
exit(-1);
|
||||
if (STOP_PROCESS_ON_ERROR) {
|
||||
fprintf(stderr, "AssertError:%s, shutting down process\n", msg);
|
||||
exit(-1);
|
||||
} else {
|
||||
fprintf(stderr, "AssertError:%s, rabit is configured to keep process running\n", msg);
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief handling of Check error, caused by inappropriate input
|
||||
* \param msg error message
|
||||
*/
|
||||
inline void HandleCheckError(const char *msg) {
|
||||
fprintf(stderr, "%s\n", msg);
|
||||
exit(-1);
|
||||
if (STOP_PROCESS_ON_ERROR) {
|
||||
fprintf(stderr, "%s, shutting down process", msg);
|
||||
exit(-1);
|
||||
} else {
|
||||
fprintf(stderr, "%s, rabit is configured to keep process running\n", msg);
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
}
|
||||
inline void HandlePrint(const char *msg) {
|
||||
printf("%s", msg);
|
||||
|
||||
@ -14,6 +14,11 @@
|
||||
#include "./allreduce_base.h"
|
||||
|
||||
namespace rabit {
|
||||
|
||||
namespace utils {
|
||||
// Definition of the flag declared `extern` next to the error handlers.
// true  : HandleAssertError/HandleCheckError call exit(-1).
// false : they throw std::runtime_error instead.
// Starts as true; AllreduceBase::SetParam overwrites it when given the
// DMLC_WORKER_STOP_PROCESS_ON_ERROR parameter ("true" / "false").
bool STOP_PROCESS_ON_ERROR = true;
|
||||
}  // namespace utils
|
||||
|
||||
namespace engine {
|
||||
// constructor
|
||||
AllreduceBase::AllreduceBase(void) {
|
||||
@ -48,6 +53,7 @@ AllreduceBase::AllreduceBase(void) {
|
||||
env_vars.push_back("DMLC_TRACKER_URI");
|
||||
env_vars.push_back("DMLC_TRACKER_PORT");
|
||||
env_vars.push_back("DMLC_WORKER_CONNECT_RETRY");
|
||||
env_vars.push_back("DMLC_WORKER_STOP_PROCESS_ON_ERROR");
|
||||
}
|
||||
|
||||
// initialization function
|
||||
@ -190,6 +196,15 @@ void AllreduceBase::SetParam(const char *name, const char *val) {
|
||||
if (!strcmp(name, "DMLC_WORKER_CONNECT_RETRY")) {
|
||||
connect_retry = atoi(val);
|
||||
}
|
||||
if (!strcmp(name, "DMLC_WORKER_STOP_PROCESS_ON_ERROR")) {
|
||||
if (!strcmp(val, "true")) {
|
||||
rabit::utils::STOP_PROCESS_ON_ERROR = true;
|
||||
} else if (!strcmp(val, "false")) {
|
||||
rabit::utils::STOP_PROCESS_ON_ERROR = false;
|
||||
} else {
|
||||
throw std::runtime_error("invalid value of DMLC_WORKER_STOP_PROCESS_ON_ERROR");
|
||||
}
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief initialize connection to the tracker
|
||||
|
||||
@ -13,6 +13,11 @@
|
||||
#include "../include/rabit/internal/engine.h"
|
||||
|
||||
namespace rabit {
|
||||
|
||||
namespace utils {
|
||||
// Definition of the flag declared `extern` next to the error handlers.
// true  : HandleAssertError/HandleCheckError call exit(-1).
// false : they throw std::runtime_error instead.
// NOTE(review): nothing visible in this file changes the value after
// initialization - confirm that is intended for this engine.
bool STOP_PROCESS_ON_ERROR = true;
|
||||
}  // namespace utils
|
||||
|
||||
namespace engine {
|
||||
/*! \brief EmptyEngine */
|
||||
class EmptyEngine : public IEngine {
|
||||
|
||||
@ -15,6 +15,11 @@
|
||||
#include "../include/rabit/internal/utils.h"
|
||||
|
||||
namespace rabit {
|
||||
|
||||
namespace utils {
|
||||
// Definition of the flag declared `extern` next to the error handlers.
// true  : HandleAssertError/HandleCheckError call exit(-1).
// false : they throw std::runtime_error instead.
// NOTE(review): nothing visible in this file changes the value after
// initialization - confirm that is intended for this engine.
bool STOP_PROCESS_ON_ERROR = true;
|
||||
}  // namespace utils
|
||||
|
||||
namespace engine {
|
||||
/*! \brief implementation of engine using MPI */
|
||||
class MPIEngine : public IEngine {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user