Squashed 'subtree/rabit/' changes from 091634b..59e63bc

59e63bc minor
6233050 ok
14477f9 add namenode
75a6d34 add libhdfs opts
e3c76bf minmum fix
8b3c435 chg
2035799 test code
7751b2b add debug
7690313 ok
bd346b4 ok
faba1dc add testload
6f7783e add testload
e5f0340 ok
3ed9ec8 chg
e552ac4 ask for more ram in am
b2505e3 only stop nm when sucess
bc696c9 add queue info
f3e867e add option queue
5dc843c refactor fileio
cd9c81b quick fix
1e23af2 add virtual destructor to iseekstream
f165ffb fix hdfs
8cc6508 allow demo to pass in env
fad4d69 ok
0fd6197 fix more
7423837 fix more
d25de54 add temporal solution, run_yarn_prog.py
e5a9e31 final attempt
ed3bee8 add command back
0774000 add hdfs to resource
9b66e7e fix hadoop
6812f14 ok
08e1c16 change hadoop prefix back to hadoop home
d6b6828 Update build.sh
146e069 bugfix: logical boundary for ring buffer
19cb685 ok
4cf3c13 Merge branch 'master' of ssh://github.com/tqchen/rabit
20daddb add tracker
c57dad8 add ringbased passing and batch schedule
295d8a1 update
994cb02 add sge
014c866 OK

git-subtree-dir: subtree/rabit
git-subtree-split: 59e63bc135
This commit is contained in:
tqchen
2015-03-21 00:44:31 -07:00
parent 13a319ca01
commit 75bf97b575
34 changed files with 869 additions and 207 deletions

View File

@@ -38,6 +38,7 @@ class StreamBufferReader {
}
}
}
/*! \brief whether we are reaching the end of file */
inline bool AtEnd(void) const {
return read_len_ == 0;
}

View File

@@ -19,6 +19,7 @@ class FileStream : public utils::ISeekStream {
public:
explicit FileStream(const char *fname, const char *mode)
: use_stdio(false) {
using namespace std;
#ifndef RABIT_STRICT_CXX98_
if (!strcmp(fname, "stdin")) {
use_stdio = true; fp = stdin;
@@ -51,7 +52,7 @@ class FileStream : public utils::ISeekStream {
return std::ftell(fp);
}
virtual bool AtEnd(void) const {
return feof(fp) != 0;
return std::feof(fp) != 0;
}
inline void Close(void) {
if (fp != NULL && !use_stdio) {
@@ -60,45 +61,50 @@ class FileStream : public utils::ISeekStream {
}
private:
FILE *fp;
std::FILE *fp;
bool use_stdio;
};
/*! \brief line split from normal file system */
class FileSplit : public LineSplitBase {
class FileProvider : public LineSplitter::IFileProvider {
public:
explicit FileSplit(const char *uri, unsigned rank, unsigned nsplit) {
LineSplitBase::SplitNames(&fnames_, uri, "#");
explicit FileProvider(const char *uri) {
LineSplitter::SplitNames(&fnames_, uri, "#");
std::vector<size_t> fsize;
for (size_t i = 0; i < fnames_.size(); ++i) {
if (!strncmp(fnames_[i].c_str(), "file://", 7)) {
if (!std::strncmp(fnames_[i].c_str(), "file://", 7)) {
std::string tmp = fnames_[i].c_str() + 7;
fnames_[i] = tmp;
}
fsize.push_back(GetFileSize(fnames_[i].c_str()));
size_t fz = GetFileSize(fnames_[i].c_str());
if (fz != 0) {
fsize_.push_back(fz);
}
}
LineSplitBase::Init(fsize, rank, nsplit);
}
virtual ~FileSplit(void) {}
protected:
virtual utils::ISeekStream *GetFile(size_t file_index) {
// destrucor
virtual ~FileProvider(void) {}
virtual utils::ISeekStream *Open(size_t file_index) {
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
return new FileStream(fnames_[file_index].c_str(), "rb");
}
virtual const std::vector<size_t> &FileSize(void) const {
return fsize_;
}
private:
// file sizes
std::vector<size_t> fsize_;
// file names
std::vector<std::string> fnames_;
// get file size
inline static size_t GetFileSize(const char *fname) {
FILE *fp = utils::FopenCheck(fname, "rb");
std::FILE *fp = utils::FopenCheck(fname, "rb");
// NOTE: fseek may not be good, but serves as ok solution
fseek(fp, 0, SEEK_END);
size_t fsize = static_cast<size_t>(ftell(fp));
fclose(fp);
std::fseek(fp, 0, SEEK_END);
size_t fsize = static_cast<size_t>(std::ftell(fp));
std::fclose(fp);
return fsize;
}
private:
// file names
std::vector<std::string> fnames_;
};
} // namespace io
} // namespace rabit

View File

@@ -6,6 +6,7 @@
* \author Tianqi Chen
*/
#include <string>
#include <cstdlib>
#include <vector>
#include <hdfs.h>
#include <errno.h>
@@ -15,11 +16,15 @@
/*! \brief io interface */
namespace rabit {
namespace io {
class HDFSStream : public utils::ISeekStream {
class HDFSStream : public ISeekStream {
public:
HDFSStream(hdfsFS fs, const char *fname, const char *mode)
: fs_(fs), at_end_(false) {
int flag;
HDFSStream(hdfsFS fs,
const char *fname,
const char *mode,
bool disconnect_when_done)
: fs_(fs), at_end_(false),
disconnect_when_done_(disconnect_when_done) {
int flag = 0;
if (!strcmp(mode, "r")) {
flag = O_RDONLY;
} else if (!strcmp(mode, "w")) {
@@ -35,6 +40,9 @@ class HDFSStream : public utils::ISeekStream {
}
virtual ~HDFSStream(void) {
this->Close();
if (disconnect_when_done_) {
utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
}
}
virtual size_t Read(void *ptr, size_t size) {
tSize nread = hdfsRead(fs_, fp_, ptr, size);
@@ -86,52 +94,69 @@ class HDFSStream : public utils::ISeekStream {
}
}
inline static std::string GetNameNode(void) {
const char *nn = getenv("rabit_hdfs_namenode");
if (nn == NULL) {
return std::string("default");
} else {
return std::string(nn);
}
}
private:
hdfsFS fs_;
hdfsFile fp_;
bool at_end_;
bool disconnect_when_done_;
};
/*! \brief line split from normal file system */
class HDFSSplit : public LineSplitBase {
class HDFSProvider : public LineSplitter::IFileProvider {
public:
explicit HDFSSplit(const char *uri, unsigned rank, unsigned nsplit) {
fs_ = hdfsConnect("default", 0);
explicit HDFSProvider(const char *uri) {
fs_ = hdfsConnect(HDFSStream::GetNameNode().c_str(), 0);
utils::Check(fs_ != NULL, "error when connecting to default HDFS");
std::vector<std::string> paths;
LineSplitBase::SplitNames(&paths, uri, "#");
LineSplitter::SplitNames(&paths, uri, "#");
// get the files
std::vector<size_t> fsize;
for (size_t i = 0; i < paths.size(); ++i) {
hdfsFileInfo *info = hdfsGetPathInfo(fs_, paths[i].c_str());
utils::Check(info != NULL, "path %s do not exist", paths[i].c_str());
if (info->mKind == 'D') {
int nentry;
hdfsFileInfo *files = hdfsListDirectory(fs_, info->mName, &nentry);
utils::Check(files != NULL, "error when ListDirectory %s", info->mName);
for (int i = 0; i < nentry; ++i) {
if (files[i].mKind == 'F') {
fsize.push_back(files[i].mSize);
if (files[i].mKind == 'F' && files[i].mSize != 0) {
fsize_.push_back(files[i].mSize);
fnames_.push_back(std::string(files[i].mName));
}
}
hdfsFreeFileInfo(files, nentry);
} else {
fsize.push_back(info->mSize);
fnames_.push_back(std::string(info->mName));
if (info->mSize != 0) {
fsize_.push_back(info->mSize);
fnames_.push_back(std::string(info->mName));
}
}
hdfsFreeFileInfo(info, 1);
}
LineSplitBase::Init(fsize, rank, nsplit);
}
virtual ~HDFSSplit(void) {}
protected:
virtual utils::ISeekStream *GetFile(size_t file_index) {
virtual ~HDFSProvider(void) {
utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
}
virtual const std::vector<size_t> &FileSize(void) const {
return fsize_;
}
virtual ISeekStream *Open(size_t file_index) {
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
return new HDFSStream(fs_, fnames_[file_index].c_str(), "r");
return new HDFSStream(fs_, fnames_[file_index].c_str(), "r", false);
}
private:
// hdfs handle
hdfsFS fs_;
// file sizes
std::vector<size_t> fsize_;
// file names
std::vector<std::string> fnames_;
};

View File

@@ -25,20 +25,21 @@ namespace io {
inline InputSplit *CreateInputSplit(const char *uri,
unsigned part,
unsigned nsplit) {
using namespace std;
if (!strcmp(uri, "stdin")) {
return new SingleFileSplit(uri);
}
if (!strncmp(uri, "file://", 7)) {
return new FileSplit(uri, part, nsplit);
return new LineSplitter(new FileProvider(uri), part, nsplit);
}
if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
return new HDFSSplit(uri, part, nsplit);
return new LineSplitter(new HDFSProvider(uri), part, nsplit);
#else
utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif
}
return new FileSplit(uri, part, nsplit);
return new LineSplitter(new FileProvider(uri), part, nsplit);
}
/*!
* \brief create an stream, the stream must be able to close
@@ -48,12 +49,14 @@ inline InputSplit *CreateInputSplit(const char *uri,
* \param mode can be 'w' or 'r' for read or write
*/
inline IStream *CreateStream(const char *uri, const char *mode) {
using namespace std;
if (!strncmp(uri, "file://", 7)) {
return new FileStream(uri + 7, mode);
}
if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
return new HDFSStream(hdfsConnect("default", 0), uri, mode);
return new HDFSStream(hdfsConnect(HDFSStream::GetNameNode().c_str(), 0),
uri, mode, true);
#else
utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif

View File

@@ -19,6 +19,7 @@ namespace rabit {
* \brief namespace to handle input split and filesystem interfacing
*/
namespace io {
/*! \brief reused ISeekStream's definition */
typedef utils::ISeekStream ISeekStream;
/*!
* \brief user facing input split helper,

View File

@@ -1,7 +1,7 @@
#ifndef RABIT_LEARN_IO_LINE_SPLIT_INL_H_
#define RABIT_LEARN_IO_LINE_SPLIT_INL_H_
/*!
* \file line_split-inl.h
* \std::FILE line_split-inl.h
* \brief base implementation of line-spliter
* \author Tianqi Chen
*/
@@ -15,11 +15,42 @@
namespace rabit {
namespace io {
class LineSplitBase : public InputSplit {
/*! \brief class that split the files by line */
class LineSplitter : public InputSplit {
public:
virtual ~LineSplitBase() {
if (fs_ != NULL) delete fs_;
class IFileProvider {
public:
/*!
* \brief get the seek stream of given file_index
* \return the corresponding seek stream at head of the stream
* the seek stream's resource can be freed by calling delete
*/
virtual ISeekStream *Open(size_t file_index) = 0;
/*!
* \return const reference to size of each files
*/
virtual const std::vector<size_t> &FileSize(void) const = 0;
// virtual destructor
virtual ~IFileProvider() {}
};
// constructor
explicit LineSplitter(IFileProvider *provider,
unsigned rank,
unsigned nsplit)
: provider_(provider), fs_(NULL),
reader_(kBufferSize) {
this->Init(provider_->FileSize(), rank, nsplit);
}
// destructor
virtual ~LineSplitter() {
if (fs_ != NULL) {
delete fs_; fs_ = NULL;
}
// delete provider after destructing the streams
delete provider_;
}
// get next line
virtual bool NextLine(std::string *out_data) {
if (file_ptr_ >= file_ptr_end_ &&
offset_curr_ >= offset_end_) return false;
@@ -29,15 +60,15 @@ class LineSplitBase : public InputSplit {
if (reader_.AtEnd()) {
if (out_data->length() != 0) return true;
file_ptr_ += 1;
if (offset_curr_ >= offset_end_) return false;
if (offset_curr_ != file_offset_[file_ptr_]) {
utils::Error("warning:file size not calculated correctly\n");
utils::Error("warning: FILE size not calculated correctly\n");
offset_curr_ = file_offset_[file_ptr_];
}
if (offset_curr_ >= offset_end_) return false;
utils::Assert(file_ptr_ + 1 < file_offset_.size(),
"boundary check");
delete fs_;
fs_ = this->GetFile(file_ptr_);
fs_ = provider_->Open(file_ptr_);
reader_.set_stream(fs_);
} else {
++offset_curr_;
@@ -51,12 +82,24 @@ class LineSplitBase : public InputSplit {
}
}
}
protected:
// constructor
LineSplitBase(void)
: fs_(NULL), reader_(kBufferSize) {
/*!
* \brief split names given
* \param out_fname output std::FILE names
* \param uri_ the iput uri std::FILE
* \param dlm deliminetr
*/
inline static void SplitNames(std::vector<std::string> *out_fname,
const char *uri_,
const char *dlm) {
std::string uri = uri_;
char *p = std::strtok(BeginPtr(uri), dlm);
while (p != NULL) {
out_fname->push_back(std::string(p));
p = std::strtok(NULL, dlm);
}
}
private:
/*!
* \brief initialize the line spliter,
* \param file_size, size of each files
@@ -82,7 +125,7 @@ class LineSplitBase : public InputSplit {
file_ptr_end_ = std::upper_bound(file_offset_.begin(),
file_offset_.end(),
offset_end_) - file_offset_.begin() - 1;
fs_ = GetFile(file_ptr_);
fs_ = provider_->Open(file_ptr_);
reader_.set_stream(fs_);
// try to set the starting position correctly
if (file_offset_[file_ptr_] != offset_begin_) {
@@ -94,28 +137,10 @@ class LineSplitBase : public InputSplit {
}
}
}
/*!
* \brief get the seek stream of given file_index
* \return the corresponding seek stream at head of file
*/
virtual utils::ISeekStream *GetFile(size_t file_index) = 0;
/*!
* \brief split names given
* \param out_fname output file names
* \param uri_ the iput uri file
* \param dlm deliminetr
*/
inline static void SplitNames(std::vector<std::string> *out_fname,
const char *uri_,
const char *dlm) {
std::string uri = uri_;
char *p = strtok(BeginPtr(uri), dlm);
while (p != NULL) {
out_fname->push_back(std::string(p));
p = strtok(NULL, dlm);
}
}
private:
/*! \brief FileProvider */
IFileProvider *provider_;
/*! \brief current input stream */
utils::ISeekStream *fs_;
/*! \brief file pointer of which file to read on */
@@ -136,11 +161,11 @@ class LineSplitBase : public InputSplit {
const static size_t kBufferSize = 256;
};
/*! \brief line split from single file */
/*! \brief line split from single std::FILE */
class SingleFileSplit : public InputSplit {
public:
explicit SingleFileSplit(const char *fname) {
if (!strcmp(fname, "stdin")) {
if (!std::strcmp(fname, "stdin")) {
#ifndef RABIT_STRICT_CXX98_
use_stdin_ = true; fp_ = stdin;
#endif
@@ -151,13 +176,13 @@ class SingleFileSplit : public InputSplit {
end_of_file_ = false;
}
virtual ~SingleFileSplit(void) {
if (!use_stdin_) fclose(fp_);
if (!use_stdin_) std::fclose(fp_);
}
virtual bool NextLine(std::string *out_data) {
if (end_of_file_) return false;
out_data->clear();
while (true) {
char c = fgetc(fp_);
char c = std::fgetc(fp_);
if (c == EOF) {
end_of_file_ = true;
}
@@ -172,7 +197,7 @@ class SingleFileSplit : public InputSplit {
}
private:
FILE *fp_;
std::FILE *fp_;
bool use_stdin_;
bool end_of_file_;
};