* support run rabit tests as xgboost subproject using xgboost/dmlc-core * support tracker config set/get * remove redudant printf * remove redudant printf * add c++0x declaration * log allreduce/broadcast caller, engine should track caller stack for investigation * tracker support binary config format * Revert "tracker support binary config format" This reverts commit 2a28e5e2b55c200cb621af8d19f17ab1bc62503b. * remove caller, prototype fetch allreduce/broadcast results from resbuf * store cached allreduce/broadcast seq_no to tracker * allow restore all caches from other nodes * try new rabit collective cache, todo: recv_link seems down * link up cache restore with main recovery * cleanup load cache state * update cache api * pass test.mk * have a working tests * try to unify check into actionsummary * more logging to debug distributed hist three method issue * update rabit interface to support caller signature matching * splite seq_counter from cur_cache_seq to different variables * still see issue with inf loop * support debug print caller as well as allreduce op * cleanup * remove get/set cache from model_recover, adding recover in loadcheckpoint * clarify rabit cache strategy, cache is set only by successful collective call involving all nodes with unique cache key. if all nodes call getcache at same time, we keep rabit run collective call. 
If some nodes call getcache while others not, we backfill cache from those nodes with most entries * revert caller logs * fix lint error * fix engine mpi signature * support getcache by ref * allow result buffer presiet to filestream * add loging * try fix checkpoint failure recovery case * use int64_t to avoid overflow caused seq fault * try avoid int overflow * try fix checkpoint failure recovery case * try avoid seqno overflow to negative by offseting specifial flag value adding cache seq no to checkpoint/load checkpoint/check point ack to avoid confusion from cache recovery * fix cache seq assert error * remove loging, handle edge case * add extensive log to checkpoint state with different seq no * fix lint errors * clean up comments before merge back to master * add logs to allreduce/broadcast/checkpoint * use unsinged int 32 and give seq no larger range * address remove allreduce dropseq code segment * using caller signature to filter bootstrapallreduces * remove get/set cache from empty * apply signature to reducer * apply signature to broadcast * add key to broadcat log * fix broadcast signature * fix default _line value for non linux system * adding comments, remove sleep(1) * fix osx build issue * try fix mpi * fix doc * fix engine_empty api * logging, adding more logs, restore immutable assertion * print unsinged int with ud * fix lint * rename seqtype to kSeq and KCache indicating it's usage apply kDiffSeq check to load_cache routine * comment allreduce/broadcast log * allow tests run on arm * enable flag to turn on / off cache * add log info alert if user choose to enable rabit bootstrap cache * add rabit_debug setting so user can use config to turn on * log flags when user turn on rabit_debug * force rabit restart if tracker assign -1 rank * use OPENMP to vecotrize reducer * address comment * Revert "address comment" This reverts commit 1dc61f33e7357dad8fa65528abeb81db92c5f9ed. 
* fix checkpoint size print 0 * per feedback, remove DISABLEOPEMP, address race condition * - remove openmp from this pr - update name from cache to boostrapcache * add default value of signature macros * remove openmp from cmake file * Update src/allreduce_robust.cc Co-Authored-By: Philip Hyunsu Cho <chohyu01@cs.washington.edu> * Update src/allreduce_robust.cc Co-Authored-By: Philip Hyunsu Cho <chohyu01@cs.washington.edu> * run test with cmake * remove openmp * fix cmake based tests * use cmake test fix darwin .dylib issue * move around rabit_signature definition due to windows build * misc, add c++ check in CMakeFile * per feedback * resolve CMake file * update rabit version
216 lines
5.8 KiB
C++
216 lines
5.8 KiB
C++
/*!
|
|
* Copyright (c) 2014 by Contributors
|
|
* \file utils.h
|
|
* \brief simple utils to support the code
|
|
* \author Tianqi Chen
|
|
*/
|
|
#ifndef RABIT_INTERNAL_UTILS_H_
|
|
#define RABIT_INTERNAL_UTILS_H_
|
|
#define _CRT_SECURE_NO_WARNINGS
|
|
#include <cstdio>
|
|
#include <string>
|
|
#include <cstdlib>
|
|
#include <stdexcept>
|
|
#include <vector>
|
|
|
|
#ifndef RABIT_STRICT_CXX98_
|
|
#include <cstdarg>
|
|
#endif // RABIT_STRICT_CXX98_
|
|
|
|
#if !defined(__GNUC__) || defined(__FreeBSD__)
|
|
#define fopen64 std::fopen
|
|
#endif // !defined(__GNUC__) || defined(__FreeBSD__)
|
|
|
|
#ifdef _MSC_VER
|
|
// NOTE: sprintf_s is not equivalent to snprintf,
|
|
// they are equivalent when success, which is sufficient for our case
|
|
#define snprintf sprintf_s
|
|
#define vsnprintf vsprintf_s
|
|
|
|
#else
|
|
|
|
#ifdef _FILE_OFFSET_BITS
|
|
#if _FILE_OFFSET_BITS == 32
|
|
#pragma message("Warning: FILE OFFSET BITS defined to be 32 bit")
|
|
#endif // _FILE_OFFSET_BITS == 32
|
|
#endif // _FILE_OFFSET_BITS
|
|
|
|
#ifdef __APPLE__
|
|
#define off64_t off_t
|
|
#define fopen64 std::fopen
|
|
#endif // __APPLE__
|
|
|
|
extern "C" {
|
|
#include <sys/types.h>
|
|
}
|
|
#endif // _MSC_VER
|
|
|
|
#ifdef _MSC_VER
|
|
typedef unsigned char uint8_t;
|
|
typedef unsigned __int16 uint16_t;
|
|
typedef unsigned __int32 uint32_t;
|
|
typedef unsigned __int64 uint64_t;
|
|
typedef __int64 int64_t;
|
|
#else
|
|
#include <inttypes.h>
|
|
#endif // _MSC_VER
|
|
|
|
namespace rabit {
|
|
/*! \brief namespace for helper utils of the project */
|
|
namespace utils {
|
|
|
|
/*! \brief error message buffer length (1 << 12 == 4096 bytes) */
const int kPrintBuffer = 1 << 12;

/*!
 * \brief we may want to keep the process alive when there are multiple workers
 *  co-located in the same process; the error handlers in this header exit the
 *  process when this flag is true and throw std::runtime_error otherwise
 */
extern bool STOP_PROCESS_ON_ERROR;
|
|
|
|
#ifndef RABIT_CUSTOMIZE_MSG_
|
|
/*!
|
|
* \brief handling of Assert error, caused by inappropriate input
|
|
* \param msg error message
|
|
*/
|
|
inline void HandleAssertError(const char *msg) {
|
|
if (STOP_PROCESS_ON_ERROR) {
|
|
fprintf(stderr, "AssertError:%s, shutting down process\n", msg);
|
|
exit(-1);
|
|
} else {
|
|
fprintf(stderr, "AssertError:%s, rabit is configured to keep process running\n", msg);
|
|
throw std::runtime_error(msg);
|
|
}
|
|
}
|
|
/*!
|
|
* \brief handling of Check error, caused by inappropriate input
|
|
* \param msg error message
|
|
*/
|
|
inline void HandleCheckError(const char *msg) {
|
|
if (STOP_PROCESS_ON_ERROR) {
|
|
fprintf(stderr, "%s, shutting down process", msg);
|
|
exit(-1);
|
|
} else {
|
|
fprintf(stderr, "%s, rabit is configured to keep process running\n", msg);
|
|
throw std::runtime_error(msg);
|
|
}
|
|
}
|
|
/*!
 * \brief default print handler, writes the message to stdout unchanged
 * \param msg message to print; passed through "%s" so it is never
 *        interpreted as a format string
 */
inline void HandlePrint(const char *msg) {
  printf("%s", msg);
}
|
|
|
|
/*!
 * \brief format a printf-style message and write it to stdout, then flush
 * \param fmt printf-style format string, followed by its arguments; the
 *        message is formatted into a buffer of kPrintBuffer bytes
 */
inline void HandleLogInfo(const char *fmt, ...) {
  std::string msg(kPrintBuffer, '\0');
  va_list args;
  va_start(args, fmt);
  vsnprintf(&msg[0], kPrintBuffer, fmt, args);
  va_end(args);
  // c_str() stops at the first '\0', so the unused tail of the buffer is not printed
  fprintf(stdout, "%s", msg.c_str());
  fflush(stdout);
}
|
|
#else
|
|
#ifndef RABIT_STRICT_CXX98_
|
|
// declarations only: in this branch (RABIT_CUSTOMIZE_MSG_ is defined) the
// header provides no bodies, so someone must implement these elsewhere
void HandleAssertError(const char *msg);
void HandleCheckError(const char *msg);
void HandlePrint(const char *msg);
|
|
#endif // RABIT_STRICT_CXX98_
|
|
#endif // RABIT_CUSTOMIZE_MSG_
|
|
#ifdef RABIT_STRICT_CXX98_
|
|
// these function pointers are to be assigned; they are only declared here
// and carry the same names as the inline helpers used in the non-strict build
extern "C" void (*Printf)(const char *fmt, ...);
extern "C" int (*SPrintf)(char *buf, size_t size, const char *fmt, ...);
extern "C" void (*Assert)(int exp, const char *fmt, ...);
extern "C" void (*Check)(int exp, const char *fmt, ...);
extern "C" void (*Error)(const char *fmt, ...);
|
|
#else
|
|
/*! \brief printf, prints messages to the console */
|
|
inline void Printf(const char *fmt, ...) {
|
|
std::string msg(kPrintBuffer, '\0');
|
|
va_list args;
|
|
va_start(args, fmt);
|
|
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
|
|
va_end(args);
|
|
HandlePrint(msg.c_str());
|
|
}
|
|
/*!
 * \brief portable version of snprintf
 * \param buf destination buffer
 * \param size capacity of buf in bytes
 * \param fmt printf-style format string, followed by its arguments
 * \return the value returned by the underlying vsnprintf call
 */
inline int SPrintf(char *buf, size_t size, const char *fmt, ...) {
  va_list args;
  va_start(args, fmt);
  int ret = vsnprintf(buf, size, fmt, args);
  va_end(args);
  return ret;
}
|
|
|
|
/*! \brief assert a condition is true, use this to handle debug information */
|
|
inline void Assert(bool exp, const char *fmt, ...) {
|
|
if (!exp) {
|
|
std::string msg(kPrintBuffer, '\0');
|
|
va_list args;
|
|
va_start(args, fmt);
|
|
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
|
|
va_end(args);
|
|
HandleAssertError(msg.c_str());
|
|
}
|
|
}
|
|
|
|
/*!\brief same as assert, but this is intended to be used as a message for users */
|
|
inline void Check(bool exp, const char *fmt, ...) {
|
|
if (!exp) {
|
|
std::string msg(kPrintBuffer, '\0');
|
|
va_list args;
|
|
va_start(args, fmt);
|
|
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
|
|
va_end(args);
|
|
HandleCheckError(msg.c_str());
|
|
}
|
|
}
|
|
|
|
/*! \brief report error message, same as check */
|
|
inline void Error(const char *fmt, ...) {
|
|
{
|
|
std::string msg(kPrintBuffer, '\0');
|
|
va_list args;
|
|
va_start(args, fmt);
|
|
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
|
|
va_end(args);
|
|
HandleCheckError(msg.c_str());
|
|
}
|
|
}
|
|
#endif // RABIT_STRICT_CXX98_
|
|
|
|
/*! \brief replace fopen, report error when the file open fails */
|
|
inline std::FILE *FopenCheck(const char *fname, const char *flag) {
|
|
std::FILE *fp = fopen64(fname, flag);
|
|
Check(fp != NULL, "can not open file \"%s\"\n", fname);
|
|
return fp;
|
|
}
|
|
} // namespace utils
|
|
// easy utils that can be directly accessed in xgboost
|
|
/*!
 * \brief get the beginning address of a vector
 * \param vec the vector to inspect
 * \return pointer to the first element, or NULL when the vector is empty
 */
template<typename T>
inline T *BeginPtr(std::vector<T> &vec) {  // NOLINT(*)
  if (vec.empty()) {
    // &vec[0] must not be evaluated on an empty vector
    return NULL;
  } else {
    return &vec[0];
  }
}
|
|
/*!
 * \brief get the beginning address of a vector (read-only overload)
 * \param vec the vector to inspect
 * \return pointer to the first element, or NULL when the vector is empty
 */
template<typename T>
inline const T *BeginPtr(const std::vector<T> &vec) {  // NOLINT(*)
  if (vec.empty()) {
    // &vec[0] must not be evaluated on an empty vector
    return NULL;
  } else {
    return &vec[0];
  }
}
|
|
/*!
 * \brief get the beginning address of a string
 * \param str the string to inspect
 * \return pointer to the first character, or NULL when the string is empty
 */
inline char* BeginPtr(std::string &str) {  // NOLINT(*)
  if (str.empty()) return NULL;
  return &str[0];
}
|
|
/*!
 * \brief get the beginning address of a string (read-only overload)
 * \param str the string to inspect
 * \return pointer to the first character, or NULL when the string is empty
 */
inline const char* BeginPtr(const std::string &str) {
  if (str.empty()) return NULL;
  return &str[0];
}
|
|
} // namespace rabit
|
|
#endif // RABIT_INTERNAL_UTILS_H_
|