diff --git a/include/dmlc/base.h b/include/dmlc/base.h new file mode 100644 index 000000000..89218b4a9 --- /dev/null +++ b/include/dmlc/base.h @@ -0,0 +1,106 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file base.h + * \brief defines configuration macros + */ +#ifndef DMLC_BASE_H_ +#define DMLC_BASE_H_ + +/*! \brief whether to use glog for logging; defaults to 0 unless already defined */ +#ifndef DMLC_USE_GLOG +#define DMLC_USE_GLOG 0 +#endif + +/*! \brief whether to compile with hdfs support; defaults to 0 unless already defined */ +#ifndef DMLC_USE_HDFS +#define DMLC_USE_HDFS 0 +#endif + +/*! \brief whether to compile with s3 support; defaults to 0 unless already defined */ +#ifndef DMLC_USE_S3 +#define DMLC_USE_S3 0 +#endif + +/*! \brief whether or not to use parameter server; defaults to 0 unless already defined */ +#ifndef DMLC_USE_PS +#define DMLC_USE_PS 0 +#endif + +/*! \brief whether or not to use c++11 support. NOTE(review): this expands to an expression containing 'defined'; when such an expansion is evaluated in #if the behavior is undefined per the C++ standard (cpp.cond), and GCC spells its macro __GXX_EXPERIMENTAL_CXX0X__ with trailing underscores - confirm both */ +#ifndef DMLC_USE_CXX11 +#define DMLC_USE_CXX11 defined(__GXX_EXPERIMENTAL_CXX0X) || __cplusplus >= 201103L || defined(_MSC_VER) +#endif + +/// +/// code block to handle optionally loading +/// +#if !defined(__GNUC__) +#define fopen64 std::fopen +#endif +#ifdef _MSC_VER +// NOTE: sprintf_s is not equivalent to snprintf, +// they are equivalent on success, which is sufficient for our case +#define snprintf sprintf_s +#define vsnprintf vsprintf_s +#else +#ifdef _FILE_OFFSET_BITS +#if _FILE_OFFSET_BITS == 32 +#pragma message ("Warning: FILE OFFSET BITS defined to be 32 bit") +#endif +#endif + +#ifdef __APPLE__ +#define off64_t off_t +#define fopen64 std::fopen +#endif + +extern "C" { +#include +} +#endif + +#ifdef _MSC_VER +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +typedef __int64 int64_t; +#else +#include +#endif +#include +#include + +/*! \brief namespace for dmlc */ +namespace dmlc { +/*! 
+ * \brief safely get the beginning address of a vector + * \param vec input vector + * \return beginning address of a vector + */ +template +inline T *BeginPtr(std::vector &vec) { + if (vec.size() == 0) { + return NULL; + } else { + return &vec[0]; + } +} +/*! \brief get the beginning address of a const vector, or NULL when the vector is empty */ +template +inline const T *BeginPtr(const std::vector &vec) { + if (vec.size() == 0) { + return NULL; + } else { + return &vec[0]; + } +} +/*! \brief get the beginning address of a string, or NULL when the string is empty */ inline char* BeginPtr(std::string &str) { + if (str.length() == 0) return NULL; + return &str[0]; +} +/*! \brief get the beginning address of a const string, or NULL when the string is empty */ inline const char* BeginPtr(const std::string &str) { + if (str.length() == 0) return NULL; + return &str[0]; +} +} // namespace dmlc +#endif // DMLC_BASE_H_ diff --git a/include/dmlc/io.h b/include/dmlc/io.h index 017181c54..041d6dd36 100644 --- a/include/dmlc/io.h +++ b/include/dmlc/io.h @@ -11,6 +11,7 @@ #include #include #include +#include "./base.h" /*! \brief namespace for dmlc */ namespace dmlc { @@ -128,12 +129,25 @@ class InputSplit { /*! \brief size of the memory region */ size_t size; }; + /*! + * \brief hint the InputSplit how large a chunk + * it should return from NextChunk; + * this is only a hint and may not be enforced, + * but InputSplit will try to adjust its internal buffer + * size to the hinted value (the base implementation is a no-op) + * \param chunk_size the chunk size + */ + virtual void HintChunkSize(size_t chunk_size) {} /*! \brief reset the position of InputSplit to beginning */ virtual void BeforeFirst(void) = 0; /*! 
* \brief get the next record, the returning value * is valid until next call to NextRecord or NextChunk * caller can modify the memory content of out_rec + * + * For text, out_rec contains a single line + * For recordio, out_rec contains the content of one record (with header stripped) + * * \param out_rec used to store the result * \return true if we can successfully get next record * false if we reached end of split @@ -144,7 +158,7 @@ class InputSplit { * \brief get a chunk of memory that can contain multiple records, * the caller needs to parse the content of the resulting chunk, * for text file, out_chunk can contain data of multiple lines - * for recordio, out_chunk can contain data of multiple records + * for recordio, out_chunk can contain multiple records (including headers) * * This function ensures there won't be partial record in the chunk * caller can modify the memory content of out_chunk, @@ -157,6 +171,7 @@ class InputSplit { * \return true if we can successfully get next record * false if we reached end of split * \sa InputSplit::Create for definition of record + * \sa RecordIOChunkReader to parse recordio content from out_chunk */ virtual bool NextChunk(Blob *out_chunk) = 0; /*! \brief destructor*/