Squashed 'subtree/rabit/' changes from 091634b..59e63bc
59e63bc minor
6233050 ok
14477f9 add namenode
75a6d34 add libhdfs opts
e3c76bf minmum fix
8b3c435 chg
2035799 test code
7751b2b add debug
7690313 ok
bd346b4 ok
faba1dc add testload
6f7783e add testload
e5f0340 ok
3ed9ec8 chg
e552ac4 ask for more ram in am
b2505e3 only stop nm when sucess
bc696c9 add queue info
f3e867e add option queue
5dc843c refactor fileio
cd9c81b quick fix
1e23af2 add virtual destructor to iseekstream
f165ffb fix hdfs
8cc6508 allow demo to pass in env
fad4d69 ok
0fd6197 fix more
7423837 fix more
d25de54 add temporal solution, run_yarn_prog.py
e5a9e31 final attempt
ed3bee8 add command back
0774000 add hdfs to resource
9b66e7e fix hadoop
6812f14 ok
08e1c16 change hadoop prefix back to hadoop home
d6b6828 Update build.sh
146e069 bugfix: logical boundary for ring buffer
19cb685 ok
4cf3c13 Merge branch 'master' of ssh://github.com/tqchen/rabit
20daddb add tracker
c57dad8 add ringbased passing and batch schedule
295d8a1 update
994cb02 add sge
014c866 OK

git-subtree-dir: subtree/rabit
git-subtree-split: 59e63bc135
This commit is contained in:
@@ -38,6 +38,7 @@ class StreamBufferReader {
|
||||
}
|
||||
}
|
||||
}
|
||||
/*! \brief whether we are reaching the end of file */
|
||||
inline bool AtEnd(void) const {
|
||||
return read_len_ == 0;
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@ class FileStream : public utils::ISeekStream {
|
||||
public:
|
||||
explicit FileStream(const char *fname, const char *mode)
|
||||
: use_stdio(false) {
|
||||
using namespace std;
|
||||
#ifndef RABIT_STRICT_CXX98_
|
||||
if (!strcmp(fname, "stdin")) {
|
||||
use_stdio = true; fp = stdin;
|
||||
@@ -51,7 +52,7 @@ class FileStream : public utils::ISeekStream {
|
||||
return std::ftell(fp);
|
||||
}
|
||||
virtual bool AtEnd(void) const {
|
||||
return feof(fp) != 0;
|
||||
return std::feof(fp) != 0;
|
||||
}
|
||||
inline void Close(void) {
|
||||
if (fp != NULL && !use_stdio) {
|
||||
@@ -60,45 +61,50 @@ class FileStream : public utils::ISeekStream {
|
||||
}
|
||||
|
||||
private:
|
||||
FILE *fp;
|
||||
std::FILE *fp;
|
||||
bool use_stdio;
|
||||
};
|
||||
|
||||
/*! \brief line split from normal file system */
|
||||
class FileSplit : public LineSplitBase {
|
||||
class FileProvider : public LineSplitter::IFileProvider {
|
||||
public:
|
||||
explicit FileSplit(const char *uri, unsigned rank, unsigned nsplit) {
|
||||
LineSplitBase::SplitNames(&fnames_, uri, "#");
|
||||
explicit FileProvider(const char *uri) {
|
||||
LineSplitter::SplitNames(&fnames_, uri, "#");
|
||||
std::vector<size_t> fsize;
|
||||
for (size_t i = 0; i < fnames_.size(); ++i) {
|
||||
if (!strncmp(fnames_[i].c_str(), "file://", 7)) {
|
||||
if (!std::strncmp(fnames_[i].c_str(), "file://", 7)) {
|
||||
std::string tmp = fnames_[i].c_str() + 7;
|
||||
fnames_[i] = tmp;
|
||||
}
|
||||
fsize.push_back(GetFileSize(fnames_[i].c_str()));
|
||||
size_t fz = GetFileSize(fnames_[i].c_str());
|
||||
if (fz != 0) {
|
||||
fsize_.push_back(fz);
|
||||
}
|
||||
}
|
||||
LineSplitBase::Init(fsize, rank, nsplit);
|
||||
}
|
||||
virtual ~FileSplit(void) {}
|
||||
|
||||
protected:
|
||||
virtual utils::ISeekStream *GetFile(size_t file_index) {
|
||||
// destructor
|
||||
virtual ~FileProvider(void) {}
|
||||
virtual utils::ISeekStream *Open(size_t file_index) {
|
||||
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
|
||||
return new FileStream(fnames_[file_index].c_str(), "rb");
|
||||
}
|
||||
virtual const std::vector<size_t> &FileSize(void) const {
|
||||
return fsize_;
|
||||
}
|
||||
private:
|
||||
// file sizes
|
||||
std::vector<size_t> fsize_;
|
||||
// file names
|
||||
std::vector<std::string> fnames_;
|
||||
// get file size
|
||||
inline static size_t GetFileSize(const char *fname) {
|
||||
FILE *fp = utils::FopenCheck(fname, "rb");
|
||||
std::FILE *fp = utils::FopenCheck(fname, "rb");
|
||||
// NOTE: fseek may not be good, but serves as ok solution
|
||||
fseek(fp, 0, SEEK_END);
|
||||
size_t fsize = static_cast<size_t>(ftell(fp));
|
||||
fclose(fp);
|
||||
std::fseek(fp, 0, SEEK_END);
|
||||
size_t fsize = static_cast<size_t>(std::ftell(fp));
|
||||
std::fclose(fp);
|
||||
return fsize;
|
||||
}
|
||||
|
||||
private:
|
||||
// file names
|
||||
std::vector<std::string> fnames_;
|
||||
};
|
||||
} // namespace io
|
||||
} // namespace rabit
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <string>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <hdfs.h>
|
||||
#include <errno.h>
|
||||
@@ -15,11 +16,15 @@
|
||||
/*! \brief io interface */
|
||||
namespace rabit {
|
||||
namespace io {
|
||||
class HDFSStream : public utils::ISeekStream {
|
||||
class HDFSStream : public ISeekStream {
|
||||
public:
|
||||
HDFSStream(hdfsFS fs, const char *fname, const char *mode)
|
||||
: fs_(fs), at_end_(false) {
|
||||
int flag;
|
||||
HDFSStream(hdfsFS fs,
|
||||
const char *fname,
|
||||
const char *mode,
|
||||
bool disconnect_when_done)
|
||||
: fs_(fs), at_end_(false),
|
||||
disconnect_when_done_(disconnect_when_done) {
|
||||
int flag = 0;
|
||||
if (!strcmp(mode, "r")) {
|
||||
flag = O_RDONLY;
|
||||
} else if (!strcmp(mode, "w")) {
|
||||
@@ -35,6 +40,9 @@ class HDFSStream : public utils::ISeekStream {
|
||||
}
|
||||
virtual ~HDFSStream(void) {
|
||||
this->Close();
|
||||
if (disconnect_when_done_) {
|
||||
utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
|
||||
}
|
||||
}
|
||||
virtual size_t Read(void *ptr, size_t size) {
|
||||
tSize nread = hdfsRead(fs_, fp_, ptr, size);
|
||||
@@ -86,52 +94,69 @@ class HDFSStream : public utils::ISeekStream {
|
||||
}
|
||||
}
|
||||
|
||||
inline static std::string GetNameNode(void) {
|
||||
const char *nn = getenv("rabit_hdfs_namenode");
|
||||
if (nn == NULL) {
|
||||
return std::string("default");
|
||||
} else {
|
||||
return std::string(nn);
|
||||
}
|
||||
}
|
||||
private:
|
||||
hdfsFS fs_;
|
||||
hdfsFile fp_;
|
||||
bool at_end_;
|
||||
bool disconnect_when_done_;
|
||||
};
|
||||
|
||||
/*! \brief line split from HDFS */
|
||||
class HDFSSplit : public LineSplitBase {
|
||||
class HDFSProvider : public LineSplitter::IFileProvider {
|
||||
public:
|
||||
explicit HDFSSplit(const char *uri, unsigned rank, unsigned nsplit) {
|
||||
fs_ = hdfsConnect("default", 0);
|
||||
explicit HDFSProvider(const char *uri) {
|
||||
fs_ = hdfsConnect(HDFSStream::GetNameNode().c_str(), 0);
|
||||
utils::Check(fs_ != NULL, "error when connecting to default HDFS");
|
||||
std::vector<std::string> paths;
|
||||
LineSplitBase::SplitNames(&paths, uri, "#");
|
||||
LineSplitter::SplitNames(&paths, uri, "#");
|
||||
// get the files
|
||||
std::vector<size_t> fsize;
|
||||
for (size_t i = 0; i < paths.size(); ++i) {
|
||||
hdfsFileInfo *info = hdfsGetPathInfo(fs_, paths[i].c_str());
|
||||
utils::Check(info != NULL, "path %s do not exist", paths[i].c_str());
|
||||
if (info->mKind == 'D') {
|
||||
int nentry;
|
||||
hdfsFileInfo *files = hdfsListDirectory(fs_, info->mName, &nentry);
|
||||
utils::Check(files != NULL, "error when ListDirectory %s", info->mName);
|
||||
for (int i = 0; i < nentry; ++i) {
|
||||
if (files[i].mKind == 'F') {
|
||||
fsize.push_back(files[i].mSize);
|
||||
if (files[i].mKind == 'F' && files[i].mSize != 0) {
|
||||
fsize_.push_back(files[i].mSize);
|
||||
fnames_.push_back(std::string(files[i].mName));
|
||||
}
|
||||
}
|
||||
hdfsFreeFileInfo(files, nentry);
|
||||
} else {
|
||||
fsize.push_back(info->mSize);
|
||||
fnames_.push_back(std::string(info->mName));
|
||||
if (info->mSize != 0) {
|
||||
fsize_.push_back(info->mSize);
|
||||
fnames_.push_back(std::string(info->mName));
|
||||
}
|
||||
}
|
||||
hdfsFreeFileInfo(info, 1);
|
||||
}
|
||||
LineSplitBase::Init(fsize, rank, nsplit);
|
||||
}
|
||||
virtual ~HDFSSplit(void) {}
|
||||
|
||||
protected:
|
||||
virtual utils::ISeekStream *GetFile(size_t file_index) {
|
||||
virtual ~HDFSProvider(void) {
|
||||
utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
|
||||
}
|
||||
virtual const std::vector<size_t> &FileSize(void) const {
|
||||
return fsize_;
|
||||
}
|
||||
virtual ISeekStream *Open(size_t file_index) {
|
||||
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
|
||||
return new HDFSStream(fs_, fnames_[file_index].c_str(), "r");
|
||||
return new HDFSStream(fs_, fnames_[file_index].c_str(), "r", false);
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
// hdfs handle
|
||||
hdfsFS fs_;
|
||||
// file sizes
|
||||
std::vector<size_t> fsize_;
|
||||
// file names
|
||||
std::vector<std::string> fnames_;
|
||||
};
|
||||
|
||||
@@ -25,20 +25,21 @@ namespace io {
|
||||
inline InputSplit *CreateInputSplit(const char *uri,
|
||||
unsigned part,
|
||||
unsigned nsplit) {
|
||||
using namespace std;
|
||||
if (!strcmp(uri, "stdin")) {
|
||||
return new SingleFileSplit(uri);
|
||||
}
|
||||
if (!strncmp(uri, "file://", 7)) {
|
||||
return new FileSplit(uri, part, nsplit);
|
||||
return new LineSplitter(new FileProvider(uri), part, nsplit);
|
||||
}
|
||||
if (!strncmp(uri, "hdfs://", 7)) {
|
||||
#if RABIT_USE_HDFS
|
||||
return new HDFSSplit(uri, part, nsplit);
|
||||
return new LineSplitter(new HDFSProvider(uri), part, nsplit);
|
||||
#else
|
||||
utils::Error("Please compile with RABIT_USE_HDFS=1");
|
||||
#endif
|
||||
}
|
||||
return new FileSplit(uri, part, nsplit);
|
||||
return new LineSplitter(new FileProvider(uri), part, nsplit);
|
||||
}
|
||||
/*!
|
||||
* \brief create a stream, the stream must be able to close
|
||||
@@ -48,12 +49,14 @@ inline InputSplit *CreateInputSplit(const char *uri,
|
||||
* \param mode can be 'w' or 'r' for read or write
|
||||
*/
|
||||
inline IStream *CreateStream(const char *uri, const char *mode) {
|
||||
using namespace std;
|
||||
if (!strncmp(uri, "file://", 7)) {
|
||||
return new FileStream(uri + 7, mode);
|
||||
}
|
||||
if (!strncmp(uri, "hdfs://", 7)) {
|
||||
#if RABIT_USE_HDFS
|
||||
return new HDFSStream(hdfsConnect("default", 0), uri, mode);
|
||||
return new HDFSStream(hdfsConnect(HDFSStream::GetNameNode().c_str(), 0),
|
||||
uri, mode, true);
|
||||
#else
|
||||
utils::Error("Please compile with RABIT_USE_HDFS=1");
|
||||
#endif
|
||||
|
||||
@@ -19,6 +19,7 @@ namespace rabit {
|
||||
* \brief namespace to handle input split and filesystem interfacing
|
||||
*/
|
||||
namespace io {
|
||||
/*! \brief reused ISeekStream's definition */
|
||||
typedef utils::ISeekStream ISeekStream;
|
||||
/*!
|
||||
* \brief user facing input split helper,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#ifndef RABIT_LEARN_IO_LINE_SPLIT_INL_H_
|
||||
#define RABIT_LEARN_IO_LINE_SPLIT_INL_H_
|
||||
/*!
|
||||
* \file line_split-inl.h
|
||||
* \file line_split-inl.h
|
||||
* \brief base implementation of line-splitter
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
@@ -15,11 +15,42 @@
|
||||
|
||||
namespace rabit {
|
||||
namespace io {
|
||||
class LineSplitBase : public InputSplit {
|
||||
|
||||
/*! \brief class that splits the files by line */
|
||||
class LineSplitter : public InputSplit {
|
||||
public:
|
||||
virtual ~LineSplitBase() {
|
||||
if (fs_ != NULL) delete fs_;
|
||||
class IFileProvider {
|
||||
public:
|
||||
/*!
|
||||
* \brief get the seek stream of given file_index
|
||||
* \return the corresponding seek stream at head of the stream
|
||||
* the seek stream's resource can be freed by calling delete
|
||||
*/
|
||||
virtual ISeekStream *Open(size_t file_index) = 0;
|
||||
/*!
|
||||
* \return const reference to the sizes of the files
|
||||
*/
|
||||
virtual const std::vector<size_t> &FileSize(void) const = 0;
|
||||
// virtual destructor
|
||||
virtual ~IFileProvider() {}
|
||||
};
|
||||
// constructor
|
||||
explicit LineSplitter(IFileProvider *provider,
|
||||
unsigned rank,
|
||||
unsigned nsplit)
|
||||
: provider_(provider), fs_(NULL),
|
||||
reader_(kBufferSize) {
|
||||
this->Init(provider_->FileSize(), rank, nsplit);
|
||||
}
|
||||
// destructor
|
||||
virtual ~LineSplitter() {
|
||||
if (fs_ != NULL) {
|
||||
delete fs_; fs_ = NULL;
|
||||
}
|
||||
// delete provider after destructing the streams
|
||||
delete provider_;
|
||||
}
|
||||
// get next line
|
||||
virtual bool NextLine(std::string *out_data) {
|
||||
if (file_ptr_ >= file_ptr_end_ &&
|
||||
offset_curr_ >= offset_end_) return false;
|
||||
@@ -29,15 +60,15 @@ class LineSplitBase : public InputSplit {
|
||||
if (reader_.AtEnd()) {
|
||||
if (out_data->length() != 0) return true;
|
||||
file_ptr_ += 1;
|
||||
if (offset_curr_ >= offset_end_) return false;
|
||||
if (offset_curr_ != file_offset_[file_ptr_]) {
|
||||
utils::Error("warning:file size not calculated correctly\n");
|
||||
utils::Error("warning: FILE size not calculated correctly\n");
|
||||
offset_curr_ = file_offset_[file_ptr_];
|
||||
}
|
||||
if (offset_curr_ >= offset_end_) return false;
|
||||
utils::Assert(file_ptr_ + 1 < file_offset_.size(),
|
||||
"boundary check");
|
||||
delete fs_;
|
||||
fs_ = this->GetFile(file_ptr_);
|
||||
fs_ = provider_->Open(file_ptr_);
|
||||
reader_.set_stream(fs_);
|
||||
} else {
|
||||
++offset_curr_;
|
||||
@@ -51,12 +82,24 @@ class LineSplitBase : public InputSplit {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
// constructor
|
||||
LineSplitBase(void)
|
||||
: fs_(NULL), reader_(kBufferSize) {
|
||||
/*!
|
||||
* \brief split names given
|
||||
* \param out_fname output file names
|
||||
* \param uri_ the input uri file
|
||||
* \param dlm delimiter
|
||||
*/
|
||||
inline static void SplitNames(std::vector<std::string> *out_fname,
|
||||
const char *uri_,
|
||||
const char *dlm) {
|
||||
std::string uri = uri_;
|
||||
char *p = std::strtok(BeginPtr(uri), dlm);
|
||||
while (p != NULL) {
|
||||
out_fname->push_back(std::string(p));
|
||||
p = std::strtok(NULL, dlm);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/*!
|
||||
* \brief initialize the line splitter,
|
||||
* \param file_size, size of each files
|
||||
@@ -82,7 +125,7 @@ class LineSplitBase : public InputSplit {
|
||||
file_ptr_end_ = std::upper_bound(file_offset_.begin(),
|
||||
file_offset_.end(),
|
||||
offset_end_) - file_offset_.begin() - 1;
|
||||
fs_ = GetFile(file_ptr_);
|
||||
fs_ = provider_->Open(file_ptr_);
|
||||
reader_.set_stream(fs_);
|
||||
// try to set the starting position correctly
|
||||
if (file_offset_[file_ptr_] != offset_begin_) {
|
||||
@@ -94,28 +137,10 @@ class LineSplitBase : public InputSplit {
|
||||
}
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief get the seek stream of given file_index
|
||||
* \return the corresponding seek stream at head of file
|
||||
*/
|
||||
virtual utils::ISeekStream *GetFile(size_t file_index) = 0;
|
||||
/*!
|
||||
* \brief split names given
|
||||
* \param out_fname output file names
|
||||
* \param uri_ the input uri file
|
||||
* \param dlm delimiter
|
||||
*/
|
||||
inline static void SplitNames(std::vector<std::string> *out_fname,
|
||||
const char *uri_,
|
||||
const char *dlm) {
|
||||
std::string uri = uri_;
|
||||
char *p = strtok(BeginPtr(uri), dlm);
|
||||
while (p != NULL) {
|
||||
out_fname->push_back(std::string(p));
|
||||
p = strtok(NULL, dlm);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/*! \brief FileProvider */
|
||||
IFileProvider *provider_;
|
||||
/*! \brief current input stream */
|
||||
utils::ISeekStream *fs_;
|
||||
/*! \brief file pointer of which file to read on */
|
||||
@@ -136,11 +161,11 @@ class LineSplitBase : public InputSplit {
|
||||
const static size_t kBufferSize = 256;
|
||||
};
|
||||
|
||||
/*! \brief line split from single file */
|
||||
/*! \brief line split from single file */
|
||||
class SingleFileSplit : public InputSplit {
|
||||
public:
|
||||
explicit SingleFileSplit(const char *fname) {
|
||||
if (!strcmp(fname, "stdin")) {
|
||||
if (!std::strcmp(fname, "stdin")) {
|
||||
#ifndef RABIT_STRICT_CXX98_
|
||||
use_stdin_ = true; fp_ = stdin;
|
||||
#endif
|
||||
@@ -151,13 +176,13 @@ class SingleFileSplit : public InputSplit {
|
||||
end_of_file_ = false;
|
||||
}
|
||||
virtual ~SingleFileSplit(void) {
|
||||
if (!use_stdin_) fclose(fp_);
|
||||
if (!use_stdin_) std::fclose(fp_);
|
||||
}
|
||||
virtual bool NextLine(std::string *out_data) {
|
||||
if (end_of_file_) return false;
|
||||
out_data->clear();
|
||||
while (true) {
|
||||
char c = fgetc(fp_);
|
||||
char c = std::fgetc(fp_);
|
||||
if (c == EOF) {
|
||||
end_of_file_ = true;
|
||||
}
|
||||
@@ -172,7 +197,7 @@ class SingleFileSplit : public InputSplit {
|
||||
}
|
||||
|
||||
private:
|
||||
FILE *fp_;
|
||||
std::FILE *fp_;
|
||||
bool use_stdin_;
|
||||
bool end_of_file_;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user