Merge commit '75bf97b57539e5572e7ae8eba72bac6562c63c07'

Conflicts:
	subtree/rabit/rabit-learn/io/line_split-inl.h
	subtree/rabit/yarn/build.sh
This commit is contained in:
tqchen
2015-03-21 00:48:34 -07:00
34 changed files with 856 additions and 201 deletions

2
subtree/rabit/rabit-learn/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
config.mk
*.log

View File

@@ -38,6 +38,7 @@ class StreamBufferReader {
}
}
}
/*! \brief whether the end of file has been reached (signalled by read_len_ == 0) */
inline bool AtEnd(void) const {
  const bool nothing_left = (0 == read_len_);
  return nothing_left;
}

View File

@@ -66,27 +66,36 @@ class FileStream : public utils::ISeekStream {
};
/*! \brief line split from normal file system */
class FileSplit : public LineSplitBase {
class FileProvider : public LineSplitter::IFileProvider {
public:
explicit FileSplit(const char *uri, unsigned rank, unsigned nsplit) {
LineSplitBase::SplitNames(&fnames_, uri, "#");
explicit FileProvider(const char *uri) {
LineSplitter::SplitNames(&fnames_, uri, "#");
std::vector<size_t> fsize;
for (size_t i = 0; i < fnames_.size(); ++i) {
if (!std::strncmp(fnames_[i].c_str(), "file://", 7)) {
std::string tmp = fnames_[i].c_str() + 7;
fnames_[i] = tmp;
}
fsize.push_back(GetFileSize(fnames_[i].c_str()));
size_t fz = GetFileSize(fnames_[i].c_str());
if (fz != 0) {
fsize_.push_back(fz);
}
}
LineSplitBase::Init(fsize, rank, nsplit);
}
virtual ~FileSplit(void) {}
protected:
virtual utils::ISeekStream *GetFile(size_t file_index) {
// destructor
virtual ~FileProvider(void) {}
virtual utils::ISeekStream *Open(size_t file_index) {
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
return new FileStream(fnames_[file_index].c_str(), "rb");
}
virtual const std::vector<size_t> &FileSize(void) const {
return fsize_;
}
private:
// file sizes
std::vector<size_t> fsize_;
// file names
std::vector<std::string> fnames_;
// get file size
inline static size_t GetFileSize(const char *fname) {
std::FILE *fp = utils::FopenCheck(fname, "rb");
@@ -96,10 +105,6 @@ class FileSplit : public LineSplitBase {
std::fclose(fp);
return fsize;
}
private:
// file names
std::vector<std::string> fnames_;
};
} // namespace io
} // namespace rabit

View File

@@ -6,6 +6,7 @@
* \author Tianqi Chen
*/
#include <string>
#include <cstdlib>
#include <vector>
#include <hdfs.h>
#include <errno.h>
@@ -15,11 +16,15 @@
/*! \brief io interface */
namespace rabit {
namespace io {
class HDFSStream : public utils::ISeekStream {
class HDFSStream : public ISeekStream {
public:
HDFSStream(hdfsFS fs, const char *fname, const char *mode)
: fs_(fs), at_end_(false) {
int flag;
HDFSStream(hdfsFS fs,
const char *fname,
const char *mode,
bool disconnect_when_done)
: fs_(fs), at_end_(false),
disconnect_when_done_(disconnect_when_done) {
int flag = 0;
if (!strcmp(mode, "r")) {
flag = O_RDONLY;
} else if (!strcmp(mode, "w")) {
@@ -35,6 +40,9 @@ class HDFSStream : public utils::ISeekStream {
}
/*!
 * \brief destructor: calls Close() and, when the stream was constructed with
 *  disconnect_when_done set, also disconnects the HDFS handle it was given
 */
virtual ~HDFSStream(void) {
  this->Close();
  // the handle is only released here when this stream was asked to do so
  if (!disconnect_when_done_) return;
  const int status = hdfsDisconnect(fs_);
  utils::Check(status == 0, "hdfsDisconnect error");
}
virtual size_t Read(void *ptr, size_t size) {
tSize nread = hdfsRead(fs_, fp_, ptr, size);
@@ -86,52 +94,69 @@ class HDFSStream : public utils::ISeekStream {
}
}
/*!
 * \brief pick the name node that is passed to hdfsConnect
 * \return value of the environment variable rabit_hdfs_namenode,
 *   or "default" when that variable is not set
 */
inline static std::string GetNameNode(void) {
  const char *env_value = getenv("rabit_hdfs_namenode");
  return std::string(env_value != NULL ? env_value : "default");
}
private:
hdfsFS fs_;
hdfsFile fp_;
bool at_end_;
bool disconnect_when_done_;
};
/*! \brief line split from HDFS */
class HDFSSplit : public LineSplitBase {
class HDFSProvider : public LineSplitter::IFileProvider {
public:
explicit HDFSSplit(const char *uri, unsigned rank, unsigned nsplit) {
fs_ = hdfsConnect("default", 0);
explicit HDFSProvider(const char *uri) {
fs_ = hdfsConnect(HDFSStream::GetNameNode().c_str(), 0);
utils::Check(fs_ != NULL, "error when connecting to default HDFS");
std::vector<std::string> paths;
LineSplitBase::SplitNames(&paths, uri, "#");
LineSplitter::SplitNames(&paths, uri, "#");
// get the files
std::vector<size_t> fsize;
for (size_t i = 0; i < paths.size(); ++i) {
hdfsFileInfo *info = hdfsGetPathInfo(fs_, paths[i].c_str());
utils::Check(info != NULL, "path %s do not exist", paths[i].c_str());
if (info->mKind == 'D') {
int nentry;
hdfsFileInfo *files = hdfsListDirectory(fs_, info->mName, &nentry);
utils::Check(files != NULL, "error when ListDirectory %s", info->mName);
for (int i = 0; i < nentry; ++i) {
if (files[i].mKind == 'F') {
fsize.push_back(files[i].mSize);
if (files[i].mKind == 'F' && files[i].mSize != 0) {
fsize_.push_back(files[i].mSize);
fnames_.push_back(std::string(files[i].mName));
}
}
hdfsFreeFileInfo(files, nentry);
} else {
fsize.push_back(info->mSize);
fnames_.push_back(std::string(info->mName));
if (info->mSize != 0) {
fsize_.push_back(info->mSize);
fnames_.push_back(std::string(info->mName));
}
}
hdfsFreeFileInfo(info, 1);
}
LineSplitBase::Init(fsize, rank, nsplit);
}
virtual ~HDFSSplit(void) {}
protected:
virtual utils::ISeekStream *GetFile(size_t file_index) {
// destructor: disconnects the HDFS handle obtained in the constructor
virtual ~HDFSProvider(void) {
  const int status = hdfsDisconnect(fs_);
  utils::Check(status == 0, "hdfsDisconnect error");
}
virtual const std::vector<size_t> &FileSize(void) const {
return fsize_;
}
virtual ISeekStream *Open(size_t file_index) {
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
return new HDFSStream(fs_, fnames_[file_index].c_str(), "r");
return new HDFSStream(fs_, fnames_[file_index].c_str(), "r", false);
}
private:
// hdfs handle
hdfsFS fs_;
// file sizes
std::vector<size_t> fsize_;
// file names
std::vector<std::string> fnames_;
};

View File

@@ -30,16 +30,16 @@ inline InputSplit *CreateInputSplit(const char *uri,
return new SingleFileSplit(uri);
}
if (!strncmp(uri, "file://", 7)) {
return new FileSplit(uri, part, nsplit);
return new LineSplitter(new FileProvider(uri), part, nsplit);
}
if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
return new HDFSSplit(uri, part, nsplit);
return new LineSplitter(new HDFSProvider(uri), part, nsplit);
#else
utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif
}
return new FileSplit(uri, part, nsplit);
return new LineSplitter(new FileProvider(uri), part, nsplit);
}
/*!
* \brief create a stream, the stream must be able to close
@@ -55,7 +55,8 @@ inline IStream *CreateStream(const char *uri, const char *mode) {
}
if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
return new HDFSStream(hdfsConnect("default", 0), uri, mode);
return new HDFSStream(hdfsConnect(HDFSStream::GetNameNode().c_str(), 0),
uri, mode, true);
#else
utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif

View File

@@ -19,6 +19,7 @@ namespace rabit {
* \brief namespace to handle input split and filesystem interfacing
*/
namespace io {
/*! \brief reused ISeekStream's definition */
typedef utils::ISeekStream ISeekStream;
/*!
* \brief user facing input split helper,

View File

@@ -15,11 +15,42 @@
namespace rabit {
namespace io {
class LineSplitBase : public InputSplit {
/*! \brief class that split the files by line */
class LineSplitter : public InputSplit {
public:
virtual ~LineSplitBase() {
if (fs_ != NULL) delete fs_;
/*!
 * \brief interface through which LineSplitter reaches the files it splits:
 *  it reports the size of every file and opens a file by its index.
 *  A LineSplitter constructed with a provider deletes that provider in its destructor.
 */
class IFileProvider {
public:
/*!
* \brief get the seek stream of given file_index
* \param file_index index of the file to open
* \return the corresponding seek stream at head of the stream
* the seek stream's resource can be freed by calling delete
*/
virtual ISeekStream *Open(size_t file_index) = 0;
/*!
* \return const reference to the size of each file
* NOTE(review): entry i is presumably the size of the file that Open(i) returns;
* confirm that implementations keep the two lists aligned
*/
virtual const std::vector<size_t> &FileSize(void) const = 0;
// virtual destructor, so that a provider can be deleted through this interface
virtual ~IFileProvider() {}
};
/*!
 * \brief constructor
 * \param provider source of the files to split; it is deleted by the destructor
 *   of this object, so the caller hands over ownership
 * \param rank the current rank of the data
 * \param nsplit number of split we will divide the data into
 */
explicit LineSplitter(IFileProvider *provider, unsigned rank, unsigned nsplit)
    : provider_(provider),
      fs_(NULL),
      reader_(kBufferSize) {
  const std::vector<size_t> &file_sizes = provider_->FileSize();
  this->Init(file_sizes, rank, nsplit);
}
// destructor: releases the current stream first, then the provider
virtual ~LineSplitter() {
  // deleting a null pointer is a no-op, so no explicit check is required
  delete fs_;
  fs_ = NULL;
  // the provider goes last, after the stream it opened has been destructed
  delete provider_;
}
// get next line
virtual bool NextLine(std::string *out_data) {
if (file_ptr_ >= file_ptr_end_ &&
offset_curr_ >= offset_end_) return false;
@@ -29,15 +60,15 @@ class LineSplitBase : public InputSplit {
if (reader_.AtEnd()) {
if (out_data->length() != 0) return true;
file_ptr_ += 1;
if (offset_curr_ >= offset_end_) return false;
if (offset_curr_ != file_offset_[file_ptr_]) {
utils::Error("warning:std::FILE size not calculated correctly\n");
utils::Error("warning: FILE size not calculated correctly\n");
offset_curr_ = file_offset_[file_ptr_];
}
if (offset_curr_ >= offset_end_) return false;
utils::Assert(file_ptr_ + 1 < file_offset_.size(),
"boundary check");
delete fs_;
fs_ = this->GetFile(file_ptr_);
fs_ = provider_->Open(file_ptr_);
reader_.set_stream(fs_);
} else {
++offset_curr_;
@@ -51,15 +82,27 @@ class LineSplitBase : public InputSplit {
}
}
}
protected:
// constructor
LineSplitBase(void)
: fs_(NULL), reader_(kBufferSize) {
/*!
 * \brief split a uri into the file names it contains
 *  Every character of dlm acts as a separator; empty pieces (leading,
 *  trailing, or between consecutive separators) are skipped.
 *  The scan keeps no hidden static state (unlike std::strtok), so it does not
 *  disturb, and is not disturbed by, any other tokenization in progress.
 * \param out_fname output vector, the file names found are appended to it
 * \param uri_ the input uri; NULL is treated as an empty string
 * \param dlm set of delimiter characters; NULL is treated as "no delimiter"
 */
inline static void SplitNames(std::vector<std::string> *out_fname,
                              const char *uri_,
                              const char *dlm) {
  if (uri_ == NULL) return;
  const std::string uri(uri_);
  const std::string delims(dlm != NULL ? dlm : "");
  // position of the first character that belongs to a name
  std::string::size_type begin = uri.find_first_not_of(delims);
  while (begin != std::string::npos) {
    // one past the last character of the current name
    const std::string::size_type end = uri.find_first_of(delims, begin);
    if (end == std::string::npos) {
      // last name runs until the end of the uri
      out_fname->push_back(uri.substr(begin));
      break;
    }
    out_fname->push_back(uri.substr(begin, end - begin));
    // skip the whole run of delimiters so that no empty name is produced
    begin = uri.find_first_not_of(delims, end);
  }
}
private:
/*!
* \brief initialize the line splitter,
* \param file_size, size of each std::FILEs
* \param file_size, size of each file
* \param rank the current rank of the data
* \param nsplit number of split we will divide the data into
*/
@@ -82,7 +125,7 @@ class LineSplitBase : public InputSplit {
file_ptr_end_ = std::upper_bound(file_offset_.begin(),
file_offset_.end(),
offset_end_) - file_offset_.begin() - 1;
fs_ = GetFile(file_ptr_);
fs_ = provider_->Open(file_ptr_);
reader_.set_stream(fs_);
// try to set the starting position correctly
if (file_offset_[file_ptr_] != offset_begin_) {
@@ -94,33 +137,15 @@ class LineSplitBase : public InputSplit {
}
}
}
/*!
* \brief get the seek stream of given file_index
* \return the corresponding seek stream at head of std::FILE
*/
virtual utils::ISeekStream *GetFile(size_t file_index) = 0;
/*!
* \brief split names given
* \param out_fname output std::FILE names
* \param uri_ the iput uri std::FILE
* \param dlm deliminetr
*/
inline static void SplitNames(std::vector<std::string> *out_fname,
const char *uri_,
const char *dlm) {
std::string uri = uri_;
char *p = std::strtok(BeginPtr(uri), dlm);
while (p != NULL) {
out_fname->push_back(std::string(p));
p = std::strtok(NULL, dlm);
}
}
private:
/*! \brief FileProvider */
IFileProvider *provider_;
/*! \brief current input stream */
utils::ISeekStream *fs_;
/*! \brief std::FILE pointer of which std::FILE to read on */
/*! \brief file pointer of which file to read on */
size_t file_ptr_;
/*! \brief std::FILE pointer where the end of std::FILE lies */
/*! \brief file pointer where the end of file lies */
size_t file_ptr_end_;
/*! \brief get the current offset */
size_t offset_curr_;
@@ -128,7 +153,7 @@ class LineSplitBase : public InputSplit {
size_t offset_begin_;
/*! \brief end of the offset */
size_t offset_end_;
/*! \brief byte-offset of each std::FILE */
/*! \brief byte-offset of each file */
std::vector<size_t> file_offset_;
/*! \brief buffer reader */
StreamBufferReader reader_;

View File

@@ -1,4 +1,10 @@
# specify tensor path
ifneq ("$(wildcard ../config.mk)","")
config = ../config.mk
else
config = ../make/config.mk
endif
include $(config)
BIN = linear.rabit
MOCKBIN= linear.mock
MPIBIN =
@@ -6,10 +12,10 @@ MPIBIN =
OBJ = linear.o
# common build script for programs
include ../make/config.mk
include ../make/common.mk
CFLAGS+=-fopenmp
linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
# dependencies here
linear.rabit: linear.o lib
linear.mock: linear.o lib

View File

@@ -206,21 +206,22 @@ int main(int argc, char *argv[]) {
rabit::Finalize();
return 0;
}
rabit::linear::LinearObjFunction linear;
rabit::linear::LinearObjFunction *linear = new rabit::linear::LinearObjFunction();
if (!strcmp(argv[1], "stdin")) {
linear.LoadData(argv[1]);
linear->LoadData(argv[1]);
rabit::Init(argc, argv);
} else {
rabit::Init(argc, argv);
linear.LoadData(argv[1]);
linear->LoadData(argv[1]);
}
for (int i = 2; i < argc; ++i) {
char name[256], val[256];
if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
linear.SetParam(name, val);
linear->SetParam(name, val);
}
}
linear.Run();
linear->Run();
delete linear;
rabit::Finalize();
return 0;
}

View File

@@ -26,10 +26,11 @@ struct LinearModel {
int reserved[16];
// constructor
ModelParam(void) {
memset(this, 0, sizeof(ModelParam));
base_score = 0.5f;
num_feature = 0;
loss_type = 1;
std::memset(reserved, 0, sizeof(reserved));
num_feature = 0;
}
// initialize base score
inline void InitBaseScore(void) {
@@ -119,7 +120,7 @@ struct LinearModel {
}
fi.Read(weight, sizeof(float) * (param.num_feature + 1));
}
inline void Save(rabit::IStream &fo, const float *wptr = NULL) const {
inline void Save(rabit::IStream &fo, const float *wptr = NULL) {
fo.Write(&param, sizeof(param));
if (wptr == NULL) wptr = weight;
fo.Write(wptr, sizeof(float) * (param.num_feature + 1));

View File

@@ -6,12 +6,13 @@ then
fi
# put the local training file to HDFS
hadoop fs -rm -r -f $2/data
hadoop fs -rm -r -f $2/mushroom.linear.model
hadoop fs -mkdir $2/data
hadoop fs -put ../data/agaricus.txt.train $2/data
# submit to hadoop
../../tracker/rabit_yarn.py -n $1 --vcores 1 linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}"
../../tracker/rabit_yarn.py -n $1 --vcores 1 ./linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}"
# get the final model file
hadoop fs -get $2/mushroom.linear.model ./linear.model

View File

@@ -6,7 +6,7 @@
#
# - copy this file to the root of rabit-learn folder
# - modify the configuration you want
# - type make or make -j n for parallel build
# - type make or make -j n in each of the folders
#----------------------------------------------------
# choice of compiler

View File

@@ -145,8 +145,9 @@ class LBFGSSolver {
if (silent == 0 && rabit::GetRank() == 0) {
rabit::TrackerPrintf
("L-BFGS solver starts, num_dim=%lu, init_objval=%g, size_memory=%lu\n",
gstate.num_dim, gstate.init_objval, gstate.size_memory);
("L-BFGS solver starts, num_dim=%lu, init_objval=%g, size_memory=%lu, RAM-approx=%lu\n",
gstate.num_dim, gstate.init_objval, gstate.size_memory,
gstate.MemCost() + hist.MemCost());
}
}
}
@@ -176,7 +177,7 @@ class LBFGSSolver {
// swap new weight
std::swap(g.weight, g.grad);
// check stop condition
if (gstate.num_iteration > min_lbfgs_iter) {
if (gstate.num_iteration > static_cast<size_t>(min_lbfgs_iter)) {
if (g.old_objval - g.new_objval < lbfgs_stop_tol * g.init_objval) {
return true;
}
@@ -195,7 +196,7 @@ class LBFGSSolver {
/*! \brief run optimization */
virtual void Run(void) {
this->Init();
while (gstate.num_iteration < max_lbfgs_iter) {
while (gstate.num_iteration < static_cast<size_t>(max_lbfgs_iter)) {
if (this->UpdateOneIter()) break;
}
if (silent == 0 && rabit::GetRank() == 0) {
@@ -225,7 +226,7 @@ class LBFGSSolver {
const size_t num_dim = gstate.num_dim;
const DType *gsub = grad + range_begin_;
const size_t nsub = range_end_ - range_begin_;
double vdot;
double vdot = 0.0;
if (n != 0) {
// hist[m + n - 1] stores old gradient
Minus(hist[m + n - 1], gsub, hist[m + n - 1], nsub);
@@ -241,15 +242,19 @@ class LBFGSSolver {
idxset.push_back(std::make_pair(m + j, 2 * m));
idxset.push_back(std::make_pair(m + j, m + n - 1));
}
// calculate dot products
std::vector<double> tmp(idxset.size());
for (size_t i = 0; i < tmp.size(); ++i) {
tmp[i] = hist.CalcDot(idxset[i].first, idxset[i].second);
}
rabit::Allreduce<rabit::op::Sum>(BeginPtr(tmp), tmp.size());
for (size_t i = 0; i < tmp.size(); ++i) {
gstate.DotBuf(idxset[i].first, idxset[i].second) = tmp[i];
}
// BFGS steps, use vector-free update
// parameterize vector using basis in hist
std::vector<double> alpha(n);
@@ -263,7 +268,7 @@ class LBFGSSolver {
}
alpha[j] = vsum / gstate.DotBuf(j, m + j);
delta[m + j] = delta[m + j] - alpha[j];
}
}
// scale
double scale = gstate.DotBuf(n - 1, m + n - 1) /
gstate.DotBuf(m + n - 1, m + n - 1);
@@ -279,6 +284,7 @@ class LBFGSSolver {
double beta = vsum / gstate.DotBuf(j, m + j);
delta[j] = delta[j] + (alpha[j] - beta);
}
// set all to zero
std::fill(dir, dir + num_dim, 0.0f);
DType *dirsub = dir + range_begin_;
@@ -291,10 +297,11 @@ class LBFGSSolver {
}
FixDirL1Sign(dirsub, hist[2 * m], nsub);
vdot = -Dot(dirsub, hist[2 * m], nsub);
// allreduce to get full direction
rabit::Allreduce<rabit::op::Sum>(dir, num_dim);
rabit::Allreduce<rabit::op::Sum>(&vdot, 1);
} else {
} else {
SetL1Dir(dir, grad, weight, num_dim);
vdot = -Dot(dir, dir, num_dim);
}
@@ -482,6 +489,7 @@ class LBFGSSolver {
num_iteration = 0;
num_dim = 0;
old_objval = 0.0;
offset_ = 0;
}
~GlobalState(void) {
if (grad != NULL) {
@@ -496,6 +504,10 @@ class LBFGSSolver {
data.resize(n * n, 0.0);
this->AllocSpace();
}
// memory cost in bytes: sizeof(DType) * 3 * num_dim
// NOTE(review): presumably accounts for three length-num_dim buffers held by this state; confirm
inline size_t MemCost(void) const {
  const size_t bytes_per_vector = sizeof(DType) * num_dim;
  return 3 * bytes_per_vector;
}
inline double &DotBuf(size_t i, size_t j) {
if (i > j) std::swap(i, j);
return data[MapIndex(i, offset_, size_memory) * (size_memory * 2 + 1) +
@@ -565,6 +577,10 @@ class LBFGSSolver {
size_t n = size_memory * 2 + 1;
dptr_ = new DType[n * stride_];
}
// memory cost in bytes of (size_memory_ * 2 + 1) slots of stride_ elements each
// NOTE(review): presumably the size of the dptr_ allocation; confirm
inline size_t MemCost(void) const {
  const size_t num_slots = size_memory_ * 2 + 1;
  return sizeof(DType) * num_slots * stride_;
}
// fetch element from rolling array
inline const DType *operator[](size_t i) const {
return dptr_ + MapIndex(i, offset_, size_memory_) * stride_;

View File

@@ -77,11 +77,15 @@ struct SparseMat {
feat_dim += 1;
utils::Check(feat_dim < std::numeric_limits<index_t>::max(),
"feature dimension exceed limit of index_t"\
"consider change the index_t to unsigned long");
"consider change the index_t to unsigned long");
}
// number of rows: one less than the number of entries in row_ptr
// NOTE(review): wraps around if row_ptr is empty; confirm row_ptr always holds at least one entry
inline size_t NumRow(void) const {
  const size_t num_offsets = row_ptr.size();
  return num_offsets - 1;
}
// memory cost in bytes of the entries stored in data (element count times sizeof(Entry))
inline size_t MemCost(void) const {
  const size_t num_entries = data.size();
  return sizeof(Entry) * num_entries;
}
// maximum feature dimension
size_t feat_dim;
std::vector<size_t> row_ptr;