28ca7be add linear readme ca4b20f add linear readme 1133628 add linear readme 6a11676 update docs a607047 Update build.sh 2c1cfd8 complete yarn 4f28e32 change formater 2fbda81 fix stdin input 3258bcf checkin yarn master 67ebf81 allow setup from env variables 9b6bf57 fix hdfs 395d5c2 add make system 88ce767 refactor io, initial hdfs file access need test 19be870 chgs a1bd3c6 Merge branch 'master' of ssh://github.com/tqchen/rabit 1a573f9 introduce input split 29476f1 fix timer issue git-subtree-dir: subtree/rabit git-subtree-split: 28ca7becbdf6503e6b1398588a969efb164c9701
219 lines
6.7 KiB
C++
219 lines
6.7 KiB
C++
#ifndef RABIT_LEARN_IO_BASE64_INL_H_
|
|
#define RABIT_LEARN_IO_BASE64_INL_H_
|
|
/*!
|
|
* \file base64-inl.h
|
|
* \brief data stream support to input and output from/to base64 stream
|
|
* base64 is easier to store and pass as text format in mapreduce
|
|
* \author Tianqi Chen
|
|
*/
|
|
#include <cctype>
|
|
#include <cstdio>
|
|
#include "./io.h"
|
|
#include "./buffer_reader-inl.h"
|
|
|
|
namespace rabit {
|
|
namespace io {
|
|
/*! \brief namespace of base64 decoding and encoding table */
|
|
namespace base64 {
|
|
/*!
 * \brief decoding table: entry c is the 6-bit value of the base64 symbol whose
 *  character code is c; every code that is not a base64 symbol holds 0.
 *  The table stops at 'z' (123 entries), so it must not be indexed beyond that.
 */
const char DecodeTable[] = {
  // codes 0 - 15
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // codes 16 - 31
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // codes 32 - 47: '+' sits at 43 and '/' at 47
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63,
  // codes 48 - 63: '0'-'9' followed by six unused codes ('=' is among them)
  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0,
  // codes 64 - 79: '@' (unused), then 'A'-'O'
  0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
  // codes 80 - 95: 'P'-'Z', then five unused codes
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
  // codes 96 - 111: '`' (unused), then 'a'-'o'
  0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
  // codes 112 - 122: 'p'-'z'
  41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51
};
|
|
/*! \brief encoding table: entry v is the base64 symbol for the 6-bit value v (0..63) */
static const char EncodeTable[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"   // values  0 - 25
    "abcdefghijklmnopqrstuvwxyz"   // values 26 - 51
    "0123456789"                   // values 52 - 61
    "+/";                          // values 62 - 63
|
|
} // namespace base64
|
|
/*! \brief the stream that reads from base64, note we take from file pointers */
|
|
class Base64InStream: public IStream {
|
|
public:
|
|
explicit Base64InStream(IStream *fs) : reader_(256) {
|
|
reader_.set_stream(fs);
|
|
num_prev = 0; tmp_ch = 0;
|
|
}
|
|
/*!
|
|
* \brief initialize the stream position to beginning of next base64 stream
|
|
* call this function before actually start read
|
|
*/
|
|
inline void InitPosition(void) {
|
|
// get a charater
|
|
do {
|
|
tmp_ch = reader_.GetChar();
|
|
} while (isspace(tmp_ch));
|
|
}
|
|
/*! \brief whether current position is end of a base64 stream */
|
|
inline bool IsEOF(void) const {
|
|
return num_prev == 0 && (tmp_ch == EOF || isspace(tmp_ch));
|
|
}
|
|
virtual size_t Read(void *ptr, size_t size) {
|
|
using base64::DecodeTable;
|
|
if (size == 0) return 0;
|
|
// use tlen to record left size
|
|
size_t tlen = size;
|
|
unsigned char *cptr = static_cast<unsigned char*>(ptr);
|
|
// if anything left, load from previous buffered result
|
|
if (num_prev != 0) {
|
|
if (num_prev == 2) {
|
|
if (tlen >= 2) {
|
|
*cptr++ = buf_prev[0];
|
|
*cptr++ = buf_prev[1];
|
|
tlen -= 2;
|
|
num_prev = 0;
|
|
} else {
|
|
// assert tlen == 1
|
|
*cptr++ = buf_prev[0]; --tlen;
|
|
buf_prev[0] = buf_prev[1];
|
|
num_prev = 1;
|
|
}
|
|
} else {
|
|
// assert num_prev == 1
|
|
*cptr++ = buf_prev[0]; --tlen; num_prev = 0;
|
|
}
|
|
}
|
|
if (tlen == 0) return size;
|
|
int nvalue;
|
|
// note: everything goes with 4 bytes in Base64
|
|
// so we process 4 bytes a unit
|
|
while (tlen && tmp_ch != EOF && !isspace(tmp_ch)) {
|
|
// first byte
|
|
nvalue = DecodeTable[tmp_ch] << 18;
|
|
{
|
|
// second byte
|
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
|
|
"invalid base64 format");
|
|
nvalue |= DecodeTable[tmp_ch] << 12;
|
|
*cptr++ = (nvalue >> 16) & 0xFF; --tlen;
|
|
}
|
|
{
|
|
// third byte
|
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
|
|
"invalid base64 format");
|
|
// handle termination
|
|
if (tmp_ch == '=') {
|
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == '='), "invalid base64 format");
|
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
|
|
"invalid base64 format");
|
|
break;
|
|
}
|
|
nvalue |= DecodeTable[tmp_ch] << 6;
|
|
if (tlen) {
|
|
*cptr++ = (nvalue >> 8) & 0xFF; --tlen;
|
|
} else {
|
|
buf_prev[num_prev++] = (nvalue >> 8) & 0xFF;
|
|
}
|
|
}
|
|
{
|
|
// fourth byte
|
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
|
|
"invalid base64 format");
|
|
if (tmp_ch == '=') {
|
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
|
|
"invalid base64 format");
|
|
break;
|
|
}
|
|
nvalue |= DecodeTable[tmp_ch];
|
|
if (tlen) {
|
|
*cptr++ = nvalue & 0xFF; --tlen;
|
|
} else {
|
|
buf_prev[num_prev ++] = nvalue & 0xFF;
|
|
}
|
|
}
|
|
// get next char
|
|
tmp_ch = reader_.GetChar();
|
|
}
|
|
if (kStrictCheck) {
|
|
utils::Check(tlen == 0, "Base64InStream: read incomplete");
|
|
}
|
|
return size - tlen;
|
|
}
|
|
virtual void Write(const void *ptr, size_t size) {
|
|
utils::Error("Base64InStream do not support write");
|
|
}
|
|
|
|
private:
|
|
StreamBufferReader reader_;
|
|
int tmp_ch;
|
|
int num_prev;
|
|
unsigned char buf_prev[2];
|
|
// whether we need to do strict check
|
|
static const bool kStrictCheck = false;
|
|
};
|
|
/*! \brief the stream that write to base64, note we take from file pointers */
|
|
class Base64OutStream: public IStream {
|
|
public:
|
|
explicit Base64OutStream(IStream *fp) : fp(fp) {
|
|
buf_top = 0;
|
|
}
|
|
virtual void Write(const void *ptr, size_t size) {
|
|
using base64::EncodeTable;
|
|
size_t tlen = size;
|
|
const unsigned char *cptr = static_cast<const unsigned char*>(ptr);
|
|
while (tlen) {
|
|
while (buf_top < 3 && tlen != 0) {
|
|
buf[++buf_top] = *cptr++; --tlen;
|
|
}
|
|
if (buf_top == 3) {
|
|
// flush 4 bytes out
|
|
PutChar(EncodeTable[buf[1] >> 2]);
|
|
PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
|
|
PutChar(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F]);
|
|
PutChar(EncodeTable[buf[3] & 0x3F]);
|
|
buf_top = 0;
|
|
}
|
|
}
|
|
}
|
|
virtual size_t Read(void *ptr, size_t size) {
|
|
utils::Error("Base64OutStream do not support read");
|
|
return 0;
|
|
}
|
|
/*!
|
|
* \brief finish writing of all current base64 stream, do some post processing
|
|
* \param endch charater to put to end of stream, if it is EOF, then nothing will be done
|
|
*/
|
|
inline void Finish(char endch = EOF) {
|
|
using base64::EncodeTable;
|
|
if (buf_top == 1) {
|
|
PutChar(EncodeTable[buf[1] >> 2]);
|
|
PutChar(EncodeTable[(buf[1] << 4) & 0x3F]);
|
|
PutChar('=');
|
|
PutChar('=');
|
|
}
|
|
if (buf_top == 2) {
|
|
PutChar(EncodeTable[buf[1] >> 2]);
|
|
PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
|
|
PutChar(EncodeTable[(buf[2] << 2) & 0x3F]);
|
|
PutChar('=');
|
|
}
|
|
buf_top = 0;
|
|
if (endch != EOF) PutChar(endch);
|
|
this->Flush();
|
|
}
|
|
|
|
private:
|
|
IStream *fp;
|
|
int buf_top;
|
|
unsigned char buf[4];
|
|
std::string out_buf;
|
|
const static size_t kBufferSize = 256;
|
|
|
|
inline void PutChar(char ch) {
|
|
out_buf += ch;
|
|
if (out_buf.length() >= kBufferSize) Flush();
|
|
}
|
|
inline void Flush(void) {
|
|
fp->Write(BeginPtr(out_buf), out_buf.length());
|
|
out_buf.clear();
|
|
}
|
|
};
|
|
} // namespace io
|
|
} // namespace rabit
|
|
#endif // RABIT_LEARN_IO_BASE64_INL_H_
|