checkin new dmlc interface

This commit is contained in:
tqchen 2015-04-29 20:17:27 -07:00
parent d1d2ab4599
commit a5d77ca08d
2 changed files with 122 additions and 1 deletions

106
include/dmlc/base.h Normal file
View File

@ -0,0 +1,106 @@
/*!
* Copyright (c) 2015 by Contributors
* \file base.h
* \brief defines configuration macros
*/
#ifndef DMLC_BASE_H_
#define DMLC_BASE_H_
/*! \brief whether to use glog for logging; 0 unless predefined by the build */
#if !defined(DMLC_USE_GLOG)
#define DMLC_USE_GLOG 0
#endif  // !defined(DMLC_USE_GLOG)
/*! \brief whether to compile with hdfs support; 0 unless predefined by the build */
#if !defined(DMLC_USE_HDFS)
#define DMLC_USE_HDFS 0
#endif  // !defined(DMLC_USE_HDFS)
/*! \brief whether to compile with s3 support; 0 unless predefined by the build */
#if !defined(DMLC_USE_S3)
#define DMLC_USE_S3 0
#endif  // !defined(DMLC_USE_S3)
/*! \brief whether to use the parameter server; 0 unless predefined by the build */
#if !defined(DMLC_USE_PS)
#define DMLC_USE_PS 0
#endif  // !defined(DMLC_USE_PS)
/*!
* \brief whether or not use c++11 support
*
* Always expands to a plain integral constant expression, so it can be used
* both in #if directives and in ordinary code. The detection is done with
* #if here instead of placing `defined(...)` inside the macro body, because
* a `defined` operator produced by macro expansion has undefined behavior.
* A value predefined by the build takes precedence over auto-detection.
*/
#ifndef DMLC_USE_CXX11
#if defined(__GXX_EXPERIMENTAL_CXX0X__) || defined(_MSC_VER)
#define DMLC_USE_CXX11 1
#else
#define DMLC_USE_CXX11 (__cplusplus >= 201103L)
#endif
#endif
///
/// Platform compatibility shims: provide the names fopen64, off64_t,
/// snprintf and vsnprintf on toolchains where they are assumed to be missing
///
#if !defined(__GNUC__)
// non-GNU compilers are assumed not to provide fopen64
#define fopen64 std::fopen
#endif
#ifdef _MSC_VER
// Only Visual Studio releases before 2015 (_MSC_VER < 1900) need the shim.
// Newer releases provide snprintf/vsnprintf themselves, and their <stdio.h>
// raises an #error when snprintf is defined as a macro.
#if _MSC_VER < 1900
// NOTE: sprintf_s is not equivalent to snprintf,
// they are equivalent when success, which is sufficient for our case
#define snprintf sprintf_s
#define vsnprintf vsprintf_s
#endif
#else
// emit a compile-time note when the build explicitly asks for 32 bit file offsets
#ifdef _FILE_OFFSET_BITS
#if _FILE_OFFSET_BITS == 32
#pragma message ("Warning: FILE OFFSET BITS defined to be 32 bit")
#endif
#endif
#ifdef __APPLE__
// map the *64 names onto the plain off_t / std::fopen on Apple platforms
#define off64_t off_t
#define fopen64 std::fopen
#endif
extern "C" {
#include <sys/types.h>
}
#endif
// fixed-width integer types: taken from <inttypes.h> everywhere except on
// MSVC, where they are declared from the compiler's sized integer keywords
#if !defined(_MSC_VER)
#include <inttypes.h>
#else
typedef __int64 int64_t;
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#endif  // !defined(_MSC_VER)
#include <vector>
#include <string>
/*! \brief namespace for dmlc */
namespace dmlc {
/*!
 * \brief get the address of the first element of a vector
 *  without indexing into an empty vector
 * \param vec input vector
 * \return pointer to vec[0], or NULL when vec has no elements
 */
template<typename T>
inline T *BeginPtr(std::vector<T> &vec) {
  if (vec.empty()) return NULL;
  return &vec[0];
}
/*!
 * \brief const overload: get the address of the first element of a vector
 * \param vec input vector
 * \return pointer to vec[0], or NULL when vec has no elements
 */
template<typename T>
inline const T *BeginPtr(const std::vector<T> &vec) {
  if (vec.empty()) return NULL;
  return &vec[0];
}
/*!
 * \brief get the address of the first character of a string
 * \param str input string
 * \return pointer to str[0], or NULL when str has zero length
 */
inline char* BeginPtr(std::string &str) {
  return str.empty() ? NULL : &str[0];
}
/*!
 * \brief const overload: get the address of the first character of a string
 * \param str input string
 * \return pointer to str[0], or NULL when str has zero length
 */
inline const char* BeginPtr(const std::string &str) {
  return str.empty() ? NULL : &str[0];
}
}  // namespace dmlc
#endif // DMLC_BASE_H_

View File

@ -11,6 +11,7 @@
#include <istream> #include <istream>
#include <ostream> #include <ostream>
#include <streambuf> #include <streambuf>
#include "./base.h"
/*! \brief namespace for dmlc */ /*! \brief namespace for dmlc */
namespace dmlc { namespace dmlc {
@ -128,12 +129,25 @@ class InputSplit {
/*! \brief size of the memory region */ /*! \brief size of the memory region */
size_t size; size_t size;
}; };
/*!
* \brief hint to the InputSplit how large a chunk
* it should return when implementing NextChunk;
* this is only a hint, so it may not be enforced,
* but InputSplit will try to adjust its internal buffer
* size to the hinted value.
* The default implementation here does nothing with the hint.
* \param chunk_size the hinted chunk size
*/
virtual void HintChunkSize(size_t chunk_size) {}
/*! \brief reset the position of InputSplit to beginning */ /*! \brief reset the position of InputSplit to beginning */
virtual void BeforeFirst(void) = 0; virtual void BeforeFirst(void) = 0;
/*! /*!
* \brief get the next record, the returning value * \brief get the next record, the returning value
* is valid until next call to NextRecord or NextChunk * is valid until next call to NextRecord or NextChunk
* caller can modify the memory content of out_rec * caller can modify the memory content of out_rec
*
* For text, out_rec contains a single line
* For recordio, out_rec contains the content of one record (with the header stripped)
*
* \param out_rec used to store the result * \param out_rec used to store the result
* \return true if we can successfully get next record * \return true if we can successfully get next record
* false if we reached end of split * false if we reached end of split
@ -144,7 +158,7 @@ class InputSplit {
* \brief get a chunk of memory that can contain multiple records, * \brief get a chunk of memory that can contain multiple records,
* the caller needs to parse the content of the resulting chunk, * the caller needs to parse the content of the resulting chunk,
* for text file, out_chunk can contain data of multiple lines * for text file, out_chunk can contain data of multiple lines
* for recordio, out_chunk can contain data of multiple records * for recordio, out_chunk can contain multiple records(including headers)
* *
* This function ensures there won't be partial record in the chunk * This function ensures there won't be partial record in the chunk
* caller can modify the memory content of out_chunk, * caller can modify the memory content of out_chunk,
@ -157,6 +171,7 @@ class InputSplit {
* \return true if we can successfully get next record * \return true if we can successfully get next record
* false if we reached end of split * false if we reached end of split
* \sa InputSplit::Create for definition of record * \sa InputSplit::Create for definition of record
* \sa RecordIOChunkReader to parse recordio content from out_chunk
*/ */
virtual bool NextChunk(Blob *out_chunk) = 0; virtual bool NextChunk(Blob *out_chunk) = 0;
/*! \brief destructor*/ /*! \brief destructor*/