Merge commit 'a30045c7cc54344e2084fb1fa3e01bfafc737188'

tqchen 2015-04-11 20:26:57 -07:00
commit f55f8f023f
34 changed files with 176 additions and 10902 deletions

View File

@ -5,7 +5,8 @@ rabit is a light weight library that provides a fault tolerant interface of Allr
* [Tutorial](guide)
* [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc)
* You can also directly read the [interface header](include/rabit.h)
* [Machine Learning Tools](rabit-learn)
* [Distributed Machine Learning Tools](https://github.com/dmlc/wormhole)
- Rabit is one of the backbone libraries supporting the wormhole machine learning tools
Features
====
@ -33,5 +34,4 @@ Contributing
Rabit is an open-source library; contributions are welcome, including:
* The rabit core library.
* Customized tracker script for new platforms and interface of new languages.
* Toolkits, benchmarks, and resources (links to related repos).
* Tutorial and examples about the library.

View File

@ -151,7 +151,7 @@ This section tries to give examples of different aspects of the rabit API.
#### Structure of a Rabit Program
The following code illustrates the common structure of a rabit program. This is an abstract example,
you can also refer to [kmeans.cc](../rabit-learn/kmeans/kmeans.cc) for an example implementation of kmeans algorithm.
you can also refer to [wormhole](https://github.com/dmlc/wormhole/blob/master/learn/kmeans/kmeans.cc) for an example implementation of kmeans algorithm.
```c++
#include <rabit.h>
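// What typically follows (a minimal sketch only; Model and max_iter are
// placeholders, see the linked examples for a complete program):
int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  Model model;
  // resume from the latest checkpoint; version is 0 on a fresh start
  int version = rabit::LoadCheckPoint(&model);
  if (version == 0) model.InitModel();
  for (int iter = version; iter < max_iter; ++iter) {
    // ... Allreduce/Broadcast calls that update the model ...
    // a checkpoint marks the end of one fault-tolerant iteration
    rabit::CheckPoint(&model);
  }
  rabit::Finalize();
  return 0;
}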

View File

@ -8,6 +8,10 @@
#include <cstdio>
#include <string>
#include <vector>
#include <istream>
#include <ostream>
#include <streambuf>
#include <cassert>
/*! \brief namespace for dmlc */
namespace dmlc {
@ -38,7 +42,6 @@ class Stream {
* \param uri the URI of the input; currently we support
*        hdfs://, s3://, and file://; by default file:// is used
* \param flag can be "w", "r", "a"
* \return a created stream
*/
static Stream *Create(const char *uri, const char* const flag);
// helper functions to write/read different data structures
@ -103,11 +106,14 @@ class Serializable {
class InputSplit {
public:
/*!
* \brief read next line, store into out_data
* \brief read next record, store into out_data
* the data in the returned record depends on the input data format
* if input is text data, each line is returned as a record (\n not included)
* if input is recordio, each record is returned
* \param out_data the string that stores the record data, \n is not included
* \return true if the next record was found, false if we have read all the records
*/
virtual bool ReadLine(std::string *out_data) = 0;
virtual bool ReadRecord(std::string *out_data) = 0;
/*! \brief destructor*/
virtual ~InputSplit(void) {}
/*!
@ -116,13 +122,133 @@ class InputSplit {
* \param uri the uri of the input, can contain hdfs prefix
* \param part_index the part id of current input
* \param num_parts total number of splits
* \return a created input split
*/
static InputSplit* Create(const char *uri,
unsigned part_index,
unsigned num_parts);
};
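/*
 * Hedged usage sketch (the URI and split counts below are illustrative):
 *
 *   InputSplit *in = InputSplit::Create("file://data.txt", 0, 4);
 *   std::string record;
 *   while (in->ReadRecord(&record)) {
 *     // process one record (one line, for text input)
 *   }
 *   delete in;
 */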
/*!
* \brief a std::ostream class that wraps a Stream object,
*  so that a standard ostream can write to the underlying Stream
*
* Usage example:
* \code
*
* Stream *fs = Stream::Create("hdfs:///test.txt", "w");
* dmlc::ostream os(fs);
* os << "hello world" << std::endl;
* delete fs;
* \endcode
*/
class ostream : public std::basic_ostream<char> {
public:
/*!
* \brief construct std::ostream type
* \param stream the Stream output to be used
* \param buffer_size internal streambuf size
*/
explicit ostream(Stream *stream,
size_t buffer_size = 1 << 10)
: basic_ostream<char>(NULL), buf_(buffer_size) {
this->set_stream(stream);
}
// explicitly synchronize the buffer
virtual ~ostream() {
buf_.pubsync();
}
/*!
* \brief set internal stream to be stream, reset states
* \param stream new stream as output
*/
inline void set_stream(Stream *stream) {
buf_.set_stream(stream);
this->rdbuf(&buf_);
}
private:
// internal streambuf
class OutBuf : public std::streambuf {
public:
explicit OutBuf(size_t buffer_size)
: stream_(NULL), buffer_(buffer_size) {
assert(buffer_.size() > 0);
}
// set stream to the buffer
inline void set_stream(Stream *stream);
private:
/*! \brief internal stream by StreamBuf */
Stream *stream_;
/*! \brief internal buffer */
std::vector<char> buffer_;
// override sync
inline int_type sync(void);
// override overflow
inline int_type overflow(int c);
};
/*! \brief buffer of the stream */
OutBuf buf_;
};
/*!
* \brief a std::istream class that wraps a Stream object,
*  so that a standard istream can read from the underlying Stream
*
* Usage example:
* \code
*
* Stream *fs = Stream::Create("hdfs:///test.txt", "r");
* dmlc::istream is(fs);
* is >> mydata;
* delete fs;
* \endcode
*/
class istream : public std::basic_istream<char> {
public:
/*!
* \brief construct std::istream type
* \param stream the Stream input to be used
* \param buffer_size internal buffer size
*/
explicit istream(Stream *stream,
size_t buffer_size = 1 << 10)
: basic_istream<char>(NULL), buf_(buffer_size) {
this->set_stream(stream);
}
virtual ~istream() {}
/*!
* \brief set internal stream to be stream, reset states
* \param stream new stream as input
*/
inline void set_stream(Stream *stream) {
buf_.set_stream(stream);
this->rdbuf(&buf_);
}
private:
// internal streambuf
class InBuf : public std::streambuf {
public:
explicit InBuf(size_t buffer_size)
: stream_(NULL), buffer_(buffer_size) {
assert(buffer_.size() > 0);
}
// set stream to the buffer
inline void set_stream(Stream *stream);
private:
/*! \brief internal stream by StreamBuf */
Stream *stream_;
/*! \brief internal buffer */
std::vector<char> buffer_;
// override underflow
inline int_type underflow();
};
/*! \brief input buffer */
InBuf buf_;
};
// implementations of inline functions
template<typename T>
inline void Stream::Write(const std::vector<T> &vec) {
@ -160,5 +286,48 @@ inline bool Stream::Read(std::string *out_str) {
}
return true;
}
// implementations for ostream
inline void ostream::OutBuf::set_stream(Stream *stream) {
if (stream_ != NULL) this->pubsync();
this->stream_ = stream;
this->setp(&buffer_[0], &buffer_[0] + buffer_.size() - 1);
}
inline int ostream::OutBuf::sync(void) {
if (stream_ == NULL) return -1;
std::ptrdiff_t n = pptr() - pbase();
stream_->Write(pbase(), n);
this->pbump(-static_cast<int>(n));  // pbump takes an int
return 0;
}
inline int ostream::OutBuf::overflow(int c) {
*(this->pptr()) = c;
std::ptrdiff_t n = pptr() - pbase();
this->pbump(-static_cast<int>(n));  // pbump takes an int
if (c == EOF) {
stream_->Write(pbase(), n);
} else {
stream_->Write(pbase(), n + 1);
}
return c;
}
// implementations for istream
inline void istream::InBuf::set_stream(Stream *stream) {
stream_ = stream;
this->setg(&buffer_[0], &buffer_[0], &buffer_[0]);
}
inline int istream::InBuf::underflow() {
char *bhead = &buffer_[0];
if (this->gptr() == this->egptr()) {
size_t sz = stream_->Read(bhead, buffer_.size());
this->setg(bhead, bhead, bhead + sz);
}
if (this->gptr() == this->egptr()) {
return traits_type::eof();
} else {
return traits_type::to_int_type(*gptr());
}
}
} // namespace dmlc
#endif // DMLC_IO_H_
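
As a self-contained sketch of how these wrappers combine with a custom Stream (the MemStream class below is illustrative only, not a dmlc API; it just implements the two pure virtuals a Stream needs):

```c++
#include <dmlc/io.h>
#include <algorithm>
#include <cstring>
#include <iostream>
#include <string>

// toy in-memory Stream: appends on Write, reads sequentially on Read
class MemStream : public dmlc::Stream {
 public:
  MemStream() : pos_(0) {}
  virtual size_t Read(void *ptr, size_t size) {
    size_t n = std::min(size, data_.size() - pos_);
    std::memcpy(ptr, data_.data() + pos_, n);
    pos_ += n;
    return n;
  }
  virtual void Write(const void *ptr, size_t size) {
    data_.append(static_cast<const char*>(ptr), size);
  }
 private:
  std::string data_;
  size_t pos_;
};

int main() {
  MemStream ms;
  {
    dmlc::ostream os(&ms);   // buffered writes into ms
    os << "hello " << 42 << '\n';
  }                          // destructor flushes the buffer
  dmlc::istream is(&ms);
  std::string word;
  int x;
  is >> word >> x;           // reads back "hello" and 42
  std::cout << word << ' ' << x << std::endl;
  return 0;
}
```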

View File

@ -1,2 +0,0 @@
config.mk
*.log

View File

@ -1,17 +0,0 @@
Rabit-Learn
====
This folder contains implementations of distributed machine learning algorithms built on rabit.
It also contains links to machine learning packages that use rabit.
* Contributions of toolkits, examples, and benchmarks are more than welcome!
Toolkits
====
* [KMeans Clustering](kmeans)
* [Linear and Logistic Regression](linear)
* [XGBoost: eXtreme Gradient Boosting](https://github.com/tqchen/xgboost/tree/master/multi-node)
- xgboost is a very fast boosted tree (also known as GBDT) library that can run more than
10 times faster than existing packages
- Rabit carries xgboost to the distributed environment, inheriting all the benefits of the
single-node version and scaling it to even larger problems

View File

@ -1,2 +0,0 @@
This folder contains processed example dataset used by the demos.
Copyright of the dataset belongs to the original copyright holder

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,126 +0,0 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i

View File

@ -1,218 +0,0 @@
#ifndef RABIT_LEARN_IO_BASE64_INL_H_
#define RABIT_LEARN_IO_BASE64_INL_H_
/*!
* \file base64.h
* \brief data stream support to input and output from/to base64 stream
* base64 is easier to store and pass as text format in mapreduce
* \author Tianqi Chen
*/
#include <cctype>
#include <cstdio>
#include "./io.h"
#include "./buffer_reader-inl.h"
namespace rabit {
namespace io {
/*! \brief namespace of base64 decoding and encoding table */
namespace base64 {
const char DecodeTable[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62, // '+'
0, 0, 0,
63, // '/'
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9'
0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z'
0, 0, 0, 0, 0, 0,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z'
};
static const char EncodeTable[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
} // namespace base64
/*! \brief the stream that reads base64-encoded data from an underlying stream */
class Base64InStream: public Stream {
public:
explicit Base64InStream(Stream *fs) : reader_(256) {
reader_.set_stream(fs);
num_prev = 0; tmp_ch = 0;
}
/*!
* \brief initialize the stream position to beginning of next base64 stream
* call this function before you actually start reading
*/
inline void InitPosition(void) {
// get a character
do {
tmp_ch = reader_.GetChar();
} while (isspace(tmp_ch));
}
/*! \brief whether current position is end of a base64 stream */
inline bool IsEOF(void) const {
return num_prev == 0 && (tmp_ch == EOF || isspace(tmp_ch));
}
virtual size_t Read(void *ptr, size_t size) {
using base64::DecodeTable;
if (size == 0) return 0;
// use tlen to record left size
size_t tlen = size;
unsigned char *cptr = static_cast<unsigned char*>(ptr);
// if anything left, load from previous buffered result
if (num_prev != 0) {
if (num_prev == 2) {
if (tlen >= 2) {
*cptr++ = buf_prev[0];
*cptr++ = buf_prev[1];
tlen -= 2;
num_prev = 0;
} else {
// assert tlen == 1
*cptr++ = buf_prev[0]; --tlen;
buf_prev[0] = buf_prev[1];
num_prev = 1;
}
} else {
// assert num_prev == 1
*cptr++ = buf_prev[0]; --tlen; num_prev = 0;
}
}
if (tlen == 0) return size;
int nvalue;
// note: base64 encodes every 3 bytes of payload as 4 characters,
// so we process one 4-character unit at a time
while (tlen && tmp_ch != EOF && !isspace(tmp_ch)) {
// first byte
nvalue = DecodeTable[tmp_ch] << 18;
{
// second byte
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
nvalue |= DecodeTable[tmp_ch] << 12;
*cptr++ = (nvalue >> 16) & 0xFF; --tlen;
}
{
// third byte
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
// handle termination
if (tmp_ch == '=') {
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == '='), "invalid base64 format");
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
"invalid base64 format");
break;
}
nvalue |= DecodeTable[tmp_ch] << 6;
if (tlen) {
*cptr++ = (nvalue >> 8) & 0xFF; --tlen;
} else {
buf_prev[num_prev++] = (nvalue >> 8) & 0xFF;
}
}
{
// fourth byte
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
if (tmp_ch == '=') {
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
"invalid base64 format");
break;
}
nvalue |= DecodeTable[tmp_ch];
if (tlen) {
*cptr++ = nvalue & 0xFF; --tlen;
} else {
buf_prev[num_prev ++] = nvalue & 0xFF;
}
}
// get next char
tmp_ch = reader_.GetChar();
}
if (kStrictCheck) {
utils::Check(tlen == 0, "Base64InStream: read incomplete");
}
return size - tlen;
}
virtual void Write(const void *ptr, size_t size) {
utils::Error("Base64InStream do not support write");
}
private:
StreamBufferReader reader_;
int tmp_ch;
int num_prev;
unsigned char buf_prev[2];
// whether we need to do strict check
static const bool kStrictCheck = false;
};
/*! \brief the stream that writes base64-encoded data to an underlying stream */
class Base64OutStream: public Stream {
public:
explicit Base64OutStream(Stream *fp) : fp(fp) {
buf_top = 0;
}
virtual void Write(const void *ptr, size_t size) {
using base64::EncodeTable;
size_t tlen = size;
const unsigned char *cptr = static_cast<const unsigned char*>(ptr);
while (tlen) {
while (buf_top < 3 && tlen != 0) {
buf[++buf_top] = *cptr++; --tlen;
}
if (buf_top == 3) {
// flush 4 bytes out
PutChar(EncodeTable[buf[1] >> 2]);
PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
PutChar(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F]);
PutChar(EncodeTable[buf[3] & 0x3F]);
buf_top = 0;
}
}
}
virtual size_t Read(void *ptr, size_t size) {
utils::Error("Base64OutStream do not support read");
return 0;
}
/*!
* \brief finish writing the current base64 stream and do some post-processing
* \param endch character to put at the end of the stream; if it is EOF, nothing will be done
*/
inline void Finish(char endch = EOF) {
using base64::EncodeTable;
if (buf_top == 1) {
PutChar(EncodeTable[buf[1] >> 2]);
PutChar(EncodeTable[(buf[1] << 4) & 0x3F]);
PutChar('=');
PutChar('=');
}
if (buf_top == 2) {
PutChar(EncodeTable[buf[1] >> 2]);
PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
PutChar(EncodeTable[(buf[2] << 2) & 0x3F]);
PutChar('=');
}
buf_top = 0;
if (endch != EOF) PutChar(endch);
this->Flush();
}
private:
Stream *fp;
int buf_top;
unsigned char buf[4];
std::string out_buf;
const static size_t kBufferSize = 256;
inline void PutChar(char ch) {
out_buf += ch;
if (out_buf.length() >= kBufferSize) Flush();
}
inline void Flush(void) {
fp->Write(BeginPtr(out_buf), out_buf.length());
out_buf.clear();
}
};
} // namespace io
} // namespace rabit
#endif  // RABIT_LEARN_IO_BASE64_INL_H_
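
A hedged usage sketch of the pair (the file name and payload are illustrative; linear.cc in this repository uses the same calls to save and load models):

```c++
std::string payload = "example bytes";  // illustrative payload
Stream *fo = io::CreateStream("file://model.txt", "w");
Base64OutStream bout(fo);
bout.Write(&payload[0], payload.length());
bout.Finish('\n');  // emit '=' padding, flush, and terminate the line
delete fo;

Stream *fi = io::CreateStream("file://model.txt", "r");
Base64InStream bin(fi);
bin.InitPosition();  // skip whitespace to the start of the base64 data
bin.Read(&payload[0], payload.length());
delete fi;
```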

View File

@ -1,58 +0,0 @@
#ifndef RABIT_LEARN_IO_BUFFER_READER_INL_H_
#define RABIT_LEARN_IO_BUFFER_READER_INL_H_
/*!
* \file buffer_reader-inl.h
* \brief implementation of stream buffer reader
* \author Tianqi Chen
*/
#include "./io.h"
namespace rabit {
namespace io {
/*! \brief buffer reader of the stream that allows you to get characters one at a time */
class StreamBufferReader {
public:
explicit StreamBufferReader(size_t buffer_size)
:stream_(NULL),
read_len_(1), read_ptr_(1) {
buffer_.resize(buffer_size);
}
/*!
* \brief set input stream
*/
inline void set_stream(Stream *stream) {
stream_ = stream;
read_len_ = read_ptr_ = 1;
}
/*!
* \brief allows quick read using get char
*/
// returns the next character, or EOF (as int) at the end of the stream
inline int GetChar(void) {
while (true) {
if (read_ptr_ < read_len_) {
return buffer_[read_ptr_++];
} else {
read_len_ = stream_->Read(&buffer_[0], buffer_.length());
if (read_len_ == 0) return EOF;
read_ptr_ = 0;
}
}
}
/*! \brief whether we are reaching the end of file */
inline bool AtEnd(void) const {
return read_len_ == 0;
}
private:
/*! \brief the underlying stream */
Stream *stream_;
/*! \brief buffer to hold data */
std::string buffer_;
/*! \brief length of valid data in buffer */
size_t read_len_;
/*! \brief pointer in the buffer */
size_t read_ptr_;
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_BUFFER_READER_INL_H_

View File

@ -1,112 +0,0 @@
#ifndef RABIT_LEARN_IO_FILE_INL_H_
#define RABIT_LEARN_IO_FILE_INL_H_
/*!
* \file file-inl.h
* \brief normal filesystem I/O
* \author Tianqi Chen
*/
#include <string>
#include <vector>
#include <cstdio>
#include "./io.h"
#include "./line_split-inl.h"
/*! \brief io interface */
namespace rabit {
namespace io {
/*! \brief implementation of file i/o stream */
class FileStream : public utils::SeekStream {
public:
explicit FileStream(const char *fname, const char *mode)
: use_stdio(false) {
using namespace std;
#ifndef RABIT_STRICT_CXX98_
if (!strcmp(fname, "stdin")) {
use_stdio = true; fp = stdin;
}
if (!strcmp(fname, "stdout")) {
use_stdio = true; fp = stdout;
}
#endif
if (!strncmp(fname, "file://", 7)) fname += 7;
if (!use_stdio) {
std::string flag = mode;
if (flag == "w") flag = "wb";
if (flag == "r") flag = "rb";
fp = utils::FopenCheck(fname, flag.c_str());
}
}
virtual ~FileStream(void) {
this->Close();
}
virtual size_t Read(void *ptr, size_t size) {
return std::fread(ptr, 1, size, fp);
}
virtual void Write(const void *ptr, size_t size) {
std::fwrite(ptr, size, 1, fp);
}
virtual void Seek(size_t pos) {
std::fseek(fp, static_cast<long>(pos), SEEK_SET);
}
virtual size_t Tell(void) {
return std::ftell(fp);
}
virtual bool AtEnd(void) const {
return std::feof(fp) != 0;
}
inline void Close(void) {
if (fp != NULL && !use_stdio) {
std::fclose(fp); fp = NULL;
}
}
private:
std::FILE *fp;
bool use_stdio;
};
/*! \brief line split from normal file system */
class FileProvider : public LineSplitter::IFileProvider {
public:
explicit FileProvider(const char *uri) {
  std::vector<std::string> paths;
  LineSplitter::SplitNames(&paths, uri, "#");
  for (size_t i = 0; i < paths.size(); ++i) {
    std::string fname = paths[i];
    if (!std::strncmp(fname.c_str(), "file://", 7)) {
      fname = fname.substr(7);
    }
    size_t fz = GetFileSize(fname.c_str());
    // keep fnames_ and fsize_ aligned: skip empty files entirely,
    // otherwise Open(file_index) would point at the wrong file
    if (fz != 0) {
      fsize_.push_back(fz);
      fnames_.push_back(fname);
    }
  }
}
// destructor
virtual ~FileProvider(void) {}
virtual utils::SeekStream *Open(size_t file_index) {
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
return new FileStream(fnames_[file_index].c_str(), "rb");
}
virtual const std::vector<size_t> &FileSize(void) const {
return fsize_;
}
private:
// file sizes
std::vector<size_t> fsize_;
// file names
std::vector<std::string> fnames_;
// get file size
inline static size_t GetFileSize(const char *fname) {
std::FILE *fp = utils::FopenCheck(fname, "rb");
// NOTE: fseek may not be good, but serves as ok solution
std::fseek(fp, 0, SEEK_END);
size_t fsize = static_cast<size_t>(std::ftell(fp));
std::fclose(fp);
return fsize;
}
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_FILE_INL_H_

View File

@ -1,165 +0,0 @@
#ifndef RABIT_LEARN_IO_HDFS_INL_H_
#define RABIT_LEARN_IO_HDFS_INL_H_
/*!
* \file hdfs-inl.h
* \brief HDFS I/O
* \author Tianqi Chen
*/
#include <string>
#include <cstdlib>
#include <vector>
#include <hdfs.h>
#include <errno.h>
#include "./io.h"
#include "./line_split-inl.h"
/*! \brief io interface */
namespace rabit {
namespace io {
class HDFSStream : public SeekStream {
public:
HDFSStream(hdfsFS fs,
const char *fname,
const char *mode,
bool disconnect_when_done)
: fs_(fs), at_end_(false),
disconnect_when_done_(disconnect_when_done) {
int flag = 0;
if (!strcmp(mode, "r")) {
flag = O_RDONLY;
} else if (!strcmp(mode, "w")) {
flag = O_WRONLY;
} else if (!strcmp(mode, "a")) {
flag = O_WRONLY | O_APPEND;
} else {
utils::Error("HDFSStream: unknown flag %s", mode);
}
fp_ = hdfsOpenFile(fs_, fname, flag, 0, 0, 0);
utils::Check(fp_ != NULL,
"HDFSStream: fail to open %s", fname);
}
virtual ~HDFSStream(void) {
this->Close();
if (disconnect_when_done_) {
utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
}
}
virtual size_t Read(void *ptr, size_t size) {
tSize nread = hdfsRead(fs_, fp_, ptr, size);
if (nread == -1) {
int errsv = errno;
utils::Error("HDFSStream.Read Error:%s", strerror(errsv));
}
if (nread == 0) {
at_end_ = true;
}
return static_cast<size_t>(nread);
}
virtual void Write(const void *ptr, size_t size) {
const char *buf = reinterpret_cast<const char*>(ptr);
while (size != 0) {
tSize nwrite = hdfsWrite(fs_, fp_, buf, size);
if (nwrite == -1) {
int errsv = errno;
utils::Error("HDFSStream.Write Error:%s", strerror(errsv));
}
size_t sz = static_cast<size_t>(nwrite);
buf += sz; size -= sz;
}
}
virtual void Seek(size_t pos) {
if (hdfsSeek(fs_, fp_, pos) != 0) {
int errsv = errno;
utils::Error("HDFSStream.Seek Error:%s", strerror(errsv));
}
}
virtual size_t Tell(void) {
tOffset offset = hdfsTell(fs_, fp_);
if (offset == -1) {
int errsv = errno;
utils::Error("HDFSStream.Tell Error:%s", strerror(errsv));
}
return static_cast<size_t>(offset);
}
virtual bool AtEnd(void) const {
return at_end_;
}
inline void Close(void) {
if (fp_ != NULL) {
if (hdfsCloseFile(fs_, fp_) == -1) {
int errsv = errno;
utils::Error("HDFSStream.Close Error:%s", strerror(errsv));
}
fp_ = NULL;
}
}
inline static std::string GetNameNode(void) {
const char *nn = getenv("rabit_hdfs_namenode");
if (nn == NULL) {
return std::string("default");
} else {
return std::string(nn);
}
}
private:
hdfsFS fs_;
hdfsFile fp_;
bool at_end_;
bool disconnect_when_done_;
};
/*! \brief line split from normal file system */
class HDFSProvider : public LineSplitter::IFileProvider {
public:
explicit HDFSProvider(const char *uri) {
fs_ = hdfsConnect(HDFSStream::GetNameNode().c_str(), 0);
utils::Check(fs_ != NULL, "error when connecting to default HDFS");
std::vector<std::string> paths;
LineSplitter::SplitNames(&paths, uri, "#");
// get the files
for (size_t i = 0; i < paths.size(); ++i) {
hdfsFileInfo *info = hdfsGetPathInfo(fs_, paths[i].c_str());
utils::Check(info != NULL, "path %s does not exist", paths[i].c_str());
if (info->mKind == 'D') {
int nentry;
hdfsFileInfo *files = hdfsListDirectory(fs_, info->mName, &nentry);
utils::Check(files != NULL, "error when ListDirectory %s", info->mName);
// use j here to avoid shadowing the outer loop variable i
for (int j = 0; j < nentry; ++j) {
  if (files[j].mKind == 'F' && files[j].mSize != 0) {
    fsize_.push_back(files[j].mSize);
    fnames_.push_back(std::string(files[j].mName));
  }
}
hdfsFreeFileInfo(files, nentry);
} else {
if (info->mSize != 0) {
fsize_.push_back(info->mSize);
fnames_.push_back(std::string(info->mName));
}
}
hdfsFreeFileInfo(info, 1);
}
}
virtual ~HDFSProvider(void) {
utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
}
virtual const std::vector<size_t> &FileSize(void) const {
return fsize_;
}
virtual SeekStream *Open(size_t file_index) {
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
return new HDFSStream(fs_, fnames_[file_index].c_str(), "r", false);
}
private:
// hdfs handle
hdfsFS fs_;
// file sizes
std::vector<size_t> fsize_;
// file names
std::vector<std::string> fnames_;
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_HDFS_INL_H_

View File

@ -1,99 +0,0 @@
#ifndef RABIT_LEARN_IO_IO_INL_H_
#define RABIT_LEARN_IO_IO_INL_H_
/*!
* \file io-inl.h
* \brief Input/Output utils that handle read/write
*        of files in a distributed environment
* \author Tianqi Chen
*/
#include <cstring>
#include "./io.h"
#if RABIT_USE_WORMHOLE == 0
#if RABIT_USE_HDFS
#include "./hdfs-inl.h"
#endif
#include "./file-inl.h"
#endif
namespace rabit {
namespace io {
/*!
* \brief create input split given a uri
* \param uri the uri of the input, can contain hdfs prefix
* \param part the part id of current input
* \param nsplit total number of splits
*/
inline InputSplit *CreateInputSplit(const char *uri,
unsigned part,
unsigned nsplit) {
#if RABIT_USE_WORMHOLE
return dmlc::InputSplit::Create(uri, part, nsplit);
#else
using namespace std;
if (!strcmp(uri, "stdin")) {
return new SingleFileSplit(uri);
}
if (!strncmp(uri, "file://", 7)) {
return new LineSplitter(new FileProvider(uri), part, nsplit);
}
if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
return new LineSplitter(new HDFSProvider(uri), part, nsplit);
#else
utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif
}
return new LineSplitter(new FileProvider(uri), part, nsplit);
#endif
}
template<typename TStream>
class StreamAdapter : public Stream {
public:
explicit StreamAdapter(TStream *stream)
: stream_(stream) {
}
virtual ~StreamAdapter(void) {
delete stream_;
}
virtual size_t Read(void *ptr, size_t size) {
return stream_->Read(ptr, size);
}
virtual void Write(const void *ptr, size_t size) {
stream_->Write(ptr, size);
}
private:
TStream *stream_;
};
/*!
* \brief create a stream; the stream must be able to close
* the underlying resources (files) when deleted
*
* \param uri the uri of the input, can contain hdfs prefix
* \param mode can be 'w' or 'r' for read or write
*/
inline Stream *CreateStream(const char *uri, const char *mode) {
#if RABIT_USE_WORMHOLE
return new StreamAdapter<dmlc::Stream>(dmlc::Stream::Create(uri, mode));
#else
using namespace std;
if (!strncmp(uri, "file://", 7)) {
return new FileStream(uri + 7, mode);
}
if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
return new HDFSStream(hdfsConnect(HDFSStream::GetNameNode().c_str(), 0),
uri, mode, true);
#else
utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif
}
return new FileStream(uri, mode);
#endif
}
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_IO_INL_H_
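
A hedged usage sketch of the two entry points (the data path is illustrative; in the non-wormhole build InputSplit exposes ReadLine):

```c++
// each worker reads only its own partition of the input
rabit::io::InputSplit *in = rabit::io::CreateInputSplit(
    "file://data.txt", rabit::GetRank(), rabit::GetWorldSize());
std::string line;
while (in->ReadLine(&line)) {
  // parse one line of this worker's share
}
delete in;
```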

View File

@ -1,74 +0,0 @@
#ifndef RABIT_LEARN_IO_IO_H_
#define RABIT_LEARN_IO_IO_H_
/*!
* \file io.h
* \brief Input/Output utils that handle read/write
*        of files in a distributed environment
* \author Tianqi Chen
*/
#include "../../include/rabit_serializable.h"
/*! \brief whether compile with HDFS support */
#ifndef RABIT_USE_HDFS
#define RABIT_USE_HDFS 0
#endif
#ifndef RABIT_USE_WORMHOLE
#define RABIT_USE_WORMHOLE 0
#endif
#if RABIT_USE_WORMHOLE
#include <dmlc/io.h>
#endif
/*! \brief io interface */
namespace rabit {
/*!
* \brief namespace to handle input split and filesystem interfacing
*/
namespace io {
/*! \brief reuse SeekStream's definition */
#if RABIT_USE_WORMHOLE
typedef dmlc::SeekStream SeekStream;
typedef dmlc::InputSplit InputSplit;
#else
typedef utils::SeekStream SeekStream;
/*!
* \brief user-facing input split helper,
*  can be used to get the partition of data assigned to the current node
*/
class InputSplit {
public:
/*!
* \brief get next line, store into out_data
* \param out_data the string that stores the line data,
* \n is not included
* \return true if the next line was found, false if we have read all the lines
*/
virtual bool ReadLine(std::string *out_data) = 0;
/*! \brief destructor*/
virtual ~InputSplit(void) {}
};
#endif
/*!
* \brief create input split given a uri
* \param uri the uri of the input, can contain hdfs prefix
* \param part the part id of current input
* \param nsplit total number of splits
*/
inline InputSplit *CreateInputSplit(const char *uri,
unsigned part,
unsigned nsplit);
/*!
* \brief create a stream; the stream must be able to close
* the underlying resources (files) when deleted
*
* \param uri the uri of the input, can contain hdfs prefix
* \param mode can be 'w' or 'r' for read or write
*/
inline Stream *CreateStream(const char *uri, const char *mode);
} // namespace io
} // namespace rabit
#include "./io-inl.h"
#include "./base64-inl.h"
#endif // RABIT_LEARN_IO_IO_H_

View File

@ -1,206 +0,0 @@
#ifndef RABIT_LEARN_IO_LINE_SPLIT_INL_H_
#define RABIT_LEARN_IO_LINE_SPLIT_INL_H_
/*!
* \file line_split-inl.h
* \brief base implementation of the line splitter
* \author Tianqi Chen
*/
#include <vector>
#include <utility>
#include <cstring>
#include <cstdio>
#include <string>
#include "../../include/rabit.h"
#include "./io.h"
#include "./buffer_reader-inl.h"
namespace rabit {
namespace io {
/*! \brief class that split the files by line */
class LineSplitter : public InputSplit {
public:
class IFileProvider {
public:
/*!
* \brief get the seek stream of given file_index
* \return the corresponding seek stream at head of the stream
* the seek stream's resource can be freed by calling delete
*/
virtual SeekStream *Open(size_t file_index) = 0;
/*!
* \return const reference to the sizes of each file
*/
virtual const std::vector<size_t> &FileSize(void) const = 0;
// virtual destructor
virtual ~IFileProvider() {}
};
// constructor
explicit LineSplitter(IFileProvider *provider,
unsigned rank,
unsigned nsplit)
: provider_(provider), fs_(NULL),
reader_(kBufferSize) {
this->Init(provider_->FileSize(), rank, nsplit);
}
// destructor
virtual ~LineSplitter() {
if (fs_ != NULL) {
delete fs_; fs_ = NULL;
}
// delete provider after destructing the streams
delete provider_;
}
// get next line
virtual bool ReadLine(std::string *out_data) {
if (file_ptr_ >= file_ptr_end_ &&
offset_curr_ >= offset_end_) return false;
out_data->clear();
while (true) {
char c = reader_.GetChar();
if (reader_.AtEnd()) {
if (out_data->length() != 0) return true;
file_ptr_ += 1;
if (offset_curr_ >= offset_end_) return false;
if (offset_curr_ != file_offset_[file_ptr_]) {
utils::Error("warning: FILE size not calculated correctly\n");
offset_curr_ = file_offset_[file_ptr_];
}
utils::Assert(file_ptr_ + 1 < file_offset_.size(),
"boundary check");
delete fs_;
fs_ = provider_->Open(file_ptr_);
reader_.set_stream(fs_);
} else {
++offset_curr_;
if (c != '\r' && c != '\n' && c != EOF) {
*out_data += c;
} else {
if (out_data->length() != 0) return true;
if (file_ptr_ >= file_ptr_end_ &&
offset_curr_ >= offset_end_) return false;
}
}
}
}
/*!
* \brief split a given uri into names
* \param out_fname output file names
* \param uri_ the input uri
* \param dlm delimiter
*/
inline static void SplitNames(std::vector<std::string> *out_fname,
const char *uri_,
const char *dlm) {
std::string uri = uri_;
char *p = std::strtok(BeginPtr(uri), dlm);
while (p != NULL) {
out_fname->push_back(std::string(p));
p = std::strtok(NULL, dlm);
}
}
private:
/*!
* \brief initialize the line splitter
* \param file_size size of each file
* \param rank the current rank of the data
* \param nsplit number of splits we will divide the data into
*/
inline void Init(const std::vector<size_t> &file_size,
unsigned rank, unsigned nsplit) {
file_offset_.resize(file_size.size() + 1);
file_offset_[0] = 0;
for (size_t i = 0; i < file_size.size(); ++i) {
file_offset_[i + 1] = file_offset_[i] + file_size[i];
}
size_t ntotal = file_offset_.back();
size_t nstep = (ntotal + nsplit - 1) / nsplit;
offset_begin_ = std::min(nstep * rank, ntotal);
offset_end_ = std::min(nstep * (rank + 1), ntotal);
offset_curr_ = offset_begin_;
if (offset_begin_ == offset_end_) return;
file_ptr_ = std::upper_bound(file_offset_.begin(),
file_offset_.end(),
offset_begin_) - file_offset_.begin() - 1;
file_ptr_end_ = std::upper_bound(file_offset_.begin(),
file_offset_.end(),
offset_end_) - file_offset_.begin() - 1;
fs_ = provider_->Open(file_ptr_);
reader_.set_stream(fs_);
// try to set the starting position correctly
if (file_offset_[file_ptr_] != offset_begin_) {
fs_->Seek(offset_begin_ - file_offset_[file_ptr_]);
while (true) {
char c = reader_.GetChar();
if (!reader_.AtEnd()) ++offset_curr_;
if (c == '\n' || c == '\r' || c == EOF) return;
}
}
}
private:
/*! \brief FileProvider */
IFileProvider *provider_;
/*! \brief current input stream */
utils::SeekStream *fs_;
/*! \brief file pointer of which file to read on */
size_t file_ptr_;
/*! \brief file pointer where the end of file lies */
size_t file_ptr_end_;
/*! \brief get the current offset */
size_t offset_curr_;
/*! \brief beginning of offset */
size_t offset_begin_;
/*! \brief end of the offset */
size_t offset_end_;
/*! \brief byte-offset of each file */
std::vector<size_t> file_offset_;
/*! \brief buffer reader */
StreamBufferReader reader_;
/*! \brief buffer size */
const static size_t kBufferSize = 256;
};
/*! \brief line split from a single file */
class SingleFileSplit : public InputSplit {
public:
explicit SingleFileSplit(const char *fname)
    : fp_(NULL), use_stdin_(false), end_of_file_(false) {
  if (!std::strcmp(fname, "stdin")) {
#ifndef RABIT_STRICT_CXX98_
    use_stdin_ = true; fp_ = stdin;
#endif
  }
  if (!use_stdin_) {
    fp_ = utils::FopenCheck(fname, "r");
  }
}
virtual ~SingleFileSplit(void) {
if (!use_stdin_) std::fclose(fp_);
}
virtual bool ReadLine(std::string *out_data) {
if (end_of_file_) return false;
out_data->clear();
while (true) {
char c = std::fgetc(fp_);
if (c == EOF) {
end_of_file_ = true;
}
if (c != '\r' && c != '\n' && c != EOF) {
*out_data += c;
} else {
if (out_data->length() != 0) return true;
if (end_of_file_) return false;
}
}
return false;
}
private:
std::FILE *fp_;
bool use_stdin_;
bool end_of_file_;
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_LINE_SPLIT_INL_H_

View File

@ -1,2 +0,0 @@
kmeans
*.mpi

View File

@ -1,15 +0,0 @@
# specify tensor path
BIN = kmeans.rabit
MOCKBIN= kmeans.mock
MPIBIN = kmeans.mpi
# objectives that makes up rabit library
OBJ = kmeans.o
# common build script for programs
include ../make/common.mk
# dependenies here
kmeans.rabit: kmeans.o lib
kmeans.mock: kmeans.o lib
kmeans.mpi: kmeans.o libmpi
kmeans.o: kmeans.cc ../../src/*.h

View File

@ -1,129 +0,0 @@
Toolkit
====
This folder contains some example toolkits developed with rabit to help you get started.
KMeans
====
## Input File Format
KMeans uses the LIBSVM format to parse its input. If you are not familiar with LIBSVM, you will find more details <a href="http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/">here</a>.
The format is the following:
`<label> <index1>:<value1> <index2>:<value2> ...`
where label is a dummy integer value in this case (you can add 1's to every example), `index<x>` is the index of feature x, and `value<x>` is its value.
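For instance, a made-up row that sets features 3 and 10 would look like:
```
1 3:0.5 10:1.2
```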
## Output File Format
KMeans currently outputs the centroids as dense vectors. Each line in the output file corresponds to a centroid. The number of lines in the file matches the number of clusters K you specified on the command line.
## Example
Let's go over a more detailed example...
#### Preprocess
Download the smallwiki dataset used in the Machine Learning for Big Data class at the University of Washington.
http://courses.cs.washington.edu/courses/cse547/14wi/datasets/smallwiki.zip
Unzip it; you should find three files:
* tfidf.txt: each row is in the form of `docid||termid1:tfidf1,termid2:tfidf2,...`
* dictionary.txt: map of term to termid
* cluster0.txt: initial cluster centers. Not needed here.
The first thing to do is to convert the tfidf file format into the input format rabit supports, i.e. LIBSVM. For that, you can use a simple python script. The following should suffice. You should redirect the output to a file, let's say tfidf.libsvm.
```python
for line in open("tfidf.txt").read().splitlines():
example = line.split('|')[1].split(',')
example = ' '.join(example)
print '%s %s' % (1, example)
```
#### Compile
You will then need to build the KMeans program with ```make```, which will produce three binaries:
* kmeans.mpi: runs on MPI.
* kmeans.mock: uses a mock to simulate error conditions for testing purposes.
* kmeans.rabit: uses our C++ implementation.
#### Running with Hadoop
If you want to run it with Hadoop, you can execute the [./kmeans_hadoop.sh](./kmeans_hadoop.sh) script from the master node of your cluster.
You will have to edit the file in order to specify the path to the Hadoop Streaming jar. Afterwards, you can execute it with the following arguments (in the exact same order):
* number of worker nodes in your Hadoop cluster (i.e. number of slave nodes)
* path to the input data (HDFS path where you put the preprocessed file in libsvm format)
* number of clusters K (let's use 20 for this example)
* number of iterations to perform (let's use just 5 iterations)
* output path (HDFS path where to store the output data, must be a non-existent folder)
The current implementation runs for the number of iterations you specify as a command line argument. If you would like to add a convergence criterion (e.g. stop when no cluster assignment changes between iterations) you will have to modify [./kmeans.cc](./kmeans.cc). We leave that as an exercise to the reader :)
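For instance, a minimal convergence check could Allreduce the number of changed assignments each iteration (a sketch only; it assumes you keep the previous assignment of every point in a hypothetical `prev_assign` array):
```c++
// count how many points changed cluster on this worker
size_t num_changed = 0;
for (size_t i = 0; i < data.NumRow(); ++i) {
  size_t k = GetCluster(model.centroids, data[i]);
  if (k != prev_assign[i]) ++num_changed;  // prev_assign is hypothetical
  prev_assign[i] = k;
}
// sum the counts across all workers; stop when nothing moved anywhere
rabit::Allreduce<rabit::op::Sum>(&num_changed, 1);
if (num_changed == 0) break;
```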
You may have noticed that [./kmeans_hadoop.sh](./kmeans_hadoop.sh) uses kmeans.rabit binary, but you can also use kmeans.mock in order to easily test your system behavior in presence of failures. More on that later.
Don't forget to copy the preprocessed file into HDFS and create the output folder. For example, inside the bin folder in Hadoop, you can execute the following:
```bash
$ ./hadoop fs -mkdir kmeans
$ ./hadoop fs -mkdir kmeans/in
$ ./hadoop fs -put tfidf.libsvm kmeans/in
$ ./hadoop fs -mkdir kmeans/out
```
#### Running with MPI
You will need an MPI implementation installed, for example OpenMPI. To run the program, use mpirun to submit the job. This version is not fault tolerant, as it is backed by MPI.
#### Running with Mock
As previously mentioned, you can execute the kmeans example, and any of your own, with the mock binary. This allows you to test error conditions while you are developing your algorithms. As explained in the [Tutorial](../guide), passing the script certain parameters (e.g. mock=0,0,1,0) will cause a certain node to exit after calling Allreduce/Broadcast in some iteration.
You can also run this locally; you only need to split the input file into several smaller files, each of which will be used by a particular process in the shared memory environment. You can use a Unix command line tool such as split.
#### Processing Output
Once the program finishes running, you can fetch the output from HDFS. For example, inside the bin folder in Hadoop, you can execute the following:
```bash
$ ./hadoop fs -get kmeans/out/part-00000 kmeans.out
```
Each line of the output file is a centroid in dense format. Since this dataset's vocabulary is in the dictionary.txt file, you can do some simple post-processing to recover the top 10 words of each centroid. Something like this should work:
```python
words = {}
for line in open("dictionary.txt").read().splitlines():
    word, index = line.split(' ')
    words[int(index)] = word
from collections import defaultdict
clusters = defaultdict(list)
cluster_name = 0
for line in open("kmeans.out").read().splitlines():
    line = line.split(' ')
    clusters[cluster_name].extend(line)
    cluster_name += 1
import numpy as np
for j, key in enumerate(clusters):
    elements = clusters[key]
    array = np.array(elements).astype(np.float32)
    idx = np.argsort(array)[::-1][:10]
    ws = []
    for i in idx:
        ws.append(words[i])
    print 'cluster %d = %s' % (j, ' '.join(ws))
```

View File

@ -1,197 +0,0 @@
// this is a test case to test whether rabit can recover model when
// facing an exception
#include <rabit.h>
#include <rabit/utils.h>
#include <time.h>
#include "../utils/data.h"
using namespace rabit;
// simple dense matrix; mshadow or an Eigen matrix would be better,
// but this is OK for the example
struct Matrix {
inline void Init(size_t nrow, size_t ncol, float v = 0.0f) {
this->nrow = nrow;
this->ncol = ncol;
data.resize(nrow * ncol);
std::fill(data.begin(), data.end(), v);
}
inline float *operator[](size_t i) {
return &data[0] + i * ncol;
}
inline const float *operator[](size_t i) const {
return &data[0] + i * ncol;
}
inline void Print(const char *fname) {
  // write the matrix as dense rows, one centroid per line
  std::FILE *fo = utils::FopenCheck(fname, "w");
  for (size_t i = 0; i < data.size(); ++i) {
    std::fprintf(fo, "%g%c", data[i],
                 (i + 1) % ncol == 0 ? '\n' : ' ');
  }
  std::fclose(fo);
}
// number of data
size_t nrow, ncol;
std::vector<float> data;
};
// kmeans model
class Model : public rabit::Serializable {
public:
// matrix of centroids
Matrix centroids;
// load from stream
virtual void Load(rabit::Stream *fi) {
fi->Read(&centroids.nrow, sizeof(centroids.nrow));
fi->Read(&centroids.ncol, sizeof(centroids.ncol));
fi->Read(&centroids.data);
}
/*! \brief save the model to the stream */
virtual void Save(rabit::Stream *fo) const {
fo->Write(&centroids.nrow, sizeof(centroids.nrow));
fo->Write(&centroids.ncol, sizeof(centroids.ncol));
fo->Write(centroids.data);
}
virtual void InitModel(unsigned num_cluster, unsigned feat_dim) {
centroids.Init(num_cluster, feat_dim);
}
// normalize L2 norm
inline void Normalize(void) {
for (size_t i = 0; i < centroids.nrow; ++i) {
float *row = centroids[i];
double wsum = 0.0;
for (size_t j = 0; j < centroids.ncol; ++j) {
wsum += row[j] * row[j];
}
wsum = sqrt(wsum);
if (wsum < 1e-6) return;
float winv = 1.0 / wsum;
for (size_t j = 0; j < centroids.ncol; ++j) {
row[j] *= winv;
}
}
}
};
inline void InitCentroids(const SparseMat &data, Matrix *centroids) {
int num_cluster = centroids->nrow;
for (int i = 0; i < num_cluster; ++i) {
int index = Random(data.NumRow());
SparseMat::Vector v = data[index];
for (unsigned j = 0; j < v.length; ++j) {
(*centroids)[i][v[j].findex] = v[j].fvalue;
}
}
for (int i = 0; i < num_cluster; ++i) {
int proc = Random(rabit::GetWorldSize());
rabit::Broadcast((*centroids)[i], centroids->ncol * sizeof(float), proc);
}
}
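// Cos computes the cosine similarity between a centroid row and a sparse
// vector: centroids are kept L2-normalized (see Model::Normalize), so
// dividing by the vector's norm alone is sufficient.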
inline double Cos(const float *row,
const SparseMat::Vector &v) {
double rdot = 0.0, rnorm = 0.0;
for (unsigned i = 0; i < v.length; ++i) {
rdot += row[v[i].findex] * v[i].fvalue;
rnorm += v[i].fvalue * v[i].fvalue;
}
return rdot / sqrt(rnorm);
}
inline size_t GetCluster(const Matrix &centroids,
const SparseMat::Vector &v) {
size_t best_k = 0;
// pick the centroid with the LARGEST cosine similarity
double best = Cos(centroids[0], v);
for (size_t k = 1; k < centroids.nrow; ++k) {
  double sim = Cos(centroids[k], v);
  if (sim > best) {
    best = sim; best_k = k;
  }
}
return best_k;
}
int main(int argc, char *argv[]) {
if (argc < 5) {
// initialize rabit engine
rabit::Init(argc, argv);
if (rabit::GetRank() == 0) {
rabit::TrackerPrintf("Usage: <data_dir> num_cluster max_iter <out_model>\n");
}
rabit::Finalize();
return 0;
}
clock_t tStart = clock();
srand(0);
// load the data
SparseMat data;
data.Load(argv[1]);
// set the parameters
int num_cluster = atoi(argv[2]);
int max_iter = atoi(argv[3]);
// initialize rabit engine
rabit::Init(argc, argv);
// load model
Model model;
int iter = rabit::LoadCheckPoint(&model);
if (iter == 0) {
rabit::Allreduce<op::Max>(&data.feat_dim, 1);
model.InitModel(num_cluster, data.feat_dim);
InitCentroids(data, &model.centroids);
model.Normalize();
rabit::TrackerPrintf("[%d] start at %s\n",
rabit::GetRank(), rabit::GetProcessorName().c_str());
} else {
rabit::TrackerPrintf("[%d] restart iter=%d\n", rabit::GetRank(), iter);
}
const unsigned num_feat = data.feat_dim;
// matrix to store the result
Matrix temp;
for (int r = iter; r < max_iter; ++r) {
temp.Init(num_cluster, num_feat + 1, 0.0f);
#if __cplusplus >= 201103L
auto lazy_get_centroid = [&]()
#endif
{
// lambda function used to calculate the data if necessary
// this function may not be called when the result can be directly recovered
const size_t ndata = data.NumRow();
for (size_t i = 0; i < ndata; ++i) {
SparseMat::Vector v = data[i];
size_t k = GetCluster(model.centroids, v);
// temp[k] += v
for (size_t j = 0; j < v.length; ++j) {
temp[k][v[j].findex] += v[j].fvalue;
}
// use last column to record counts
temp[k][num_feat] += 1.0f;
}
};
// call allreduce
#if __cplusplus >= 201103L
rabit::Allreduce<op::Sum>(&temp.data[0], temp.data.size(), lazy_get_centroid);
#else
rabit::Allreduce<op::Sum>(&temp.data[0], temp.data.size());
#endif
// set number
for (int k = 0; k < num_cluster; ++k) {
float cnt = temp[k][num_feat];
utils::Check(cnt != 0.0f, "get zero sized cluster");
for (unsigned i = 0; i < num_feat; ++i) {
model.centroids[k][i] = temp[k][i] / cnt;
}
}
model.Normalize();
rabit::LazyCheckPoint(&model);
}
// output the model file to somewhere
if (rabit::GetRank() == 0) {
model.centroids.Print(argv[4]);
}
rabit::TrackerPrintf("[%d] Time taken: %f seconds\n", rabit::GetRank(), static_cast<float>(clock() - tStart) / CLOCKS_PER_SEC);
rabit::Finalize();
return 0;
}

View File

@ -1,9 +0,0 @@
#!/bin/bash
if [ "$#" -lt 5 ];
then
echo "Usage: <nslaves> <input_data> <ncluster> <max_iteration> <output>"
exit -1
fi
#set path to hadoop streaming jar here
STREAMING_JAR=
python ../tracker/rabit_hadoop.py -hs $STREAMING_JAR -n $1 -i $2 -o $5 kmeans.rabit stdin $3 $4 stdout

View File

@ -1,2 +0,0 @@
mushroom.row*
*.model

View File

@ -1,21 +0,0 @@
ifneq ("$(wildcard ../config.mk)","")
config = ../config.mk
else
config = ../make/config.mk
endif
include $(config)
BIN = linear.rabit
MOCKBIN= linear.mock
MPIBIN =
# objectives that makes up rabit library
OBJ = linear.o
# common build script for programs
include ../make/common.mk
CFLAGS+=-fopenmp
linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
# dependenies here
linear.rabit: linear.o lib
linear.mock: linear.o lib

View File

@ -1,48 +0,0 @@
Linear and Logistic Regression
====
* input format: LibSVM
* Local Example: [run-linear.sh](run-linear.sh)
* Running on YARN: [run-yarn.sh](run-yarn.sh)
- You will need to have a YARN cluster
- Modify ```../make/config.mk``` to set USE_HDFS=1 to compile with HDFS support
- Run build.sh in [../../yarn](../../yarn) to build the YARN jar file
Multi-Threading Optimization
====
* The code can be multi-threaded, and we encourage you to use it
- Simply add ```nthread=k``` where k is the number of threads you want to use
* If you submit with YARN
- Use ```--vcores``` and ```-mem``` to request CPU and memory resources
- Some YARN schedulers do not honor CPU requests; you can request more memory to grab working slots
* Multi-threading usually improves speed
- You can use fewer workers and assign more resources to each worker
- This usually means less communication overhead and faster running time
Parameters
====
All parameters can be set in the form param=value (see the example command after the parameter list)
#### Important Parameters
* objective [default = logistic]
- can be linear or logistic
* base_score [default = 0.5]
- global bias; recommended to set it to the mean value of the label
* reg_L1 [default = 0]
- L1 regularization coefficient
* reg_L2 [default = 1]
- L2 regularization coefficient
* lbfgs_stop_tol [default = 1e-5]
- relative tolerance level of loss reduction with respect to initial loss
* max_lbfgs_iter [default = 500]
- maximum number of lbfgs iterations
### Optimization Related parameters
* min_lbfgs_iter [default = 5]
- minimum number of lbfgs iterations
* max_linesearch_iter [default = 100]
- maximum number of iterations in linesearch
* linesearch_c1 [default = 1e-4]
- c1 coefficient in backoff linesearch
* linesarch_backoff [default = 0.5]
- backoff ratio in linesearch
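For example, a local training run might look like this (the data path follows the demo scripts; the parameter values are illustrative):
```bash
./linear.rabit ../data/agaricus.txt.train objective=logistic reg_L2=1 max_lbfgs_iter=100 model_out=mushroom.model
```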

View File

@ -1,227 +0,0 @@
#include "./linear.h"
#include "../io/io.h"
namespace rabit {
namespace linear {
class LinearObjFunction : public solver::IObjFunction<float> {
public:
// training threads
int nthread;
// L2 regularization
float reg_L2;
// model
LinearModel model;
// training data
SparseMat dtrain;
// solver
solver::LBFGSSolver<float> lbfgs;
// constructor
LinearObjFunction(void) {
lbfgs.SetObjFunction(this);
nthread = 1;
reg_L2 = 0.0f;
model.weight = NULL;
task = "train";
model_in = "NULL";
name_pred = "pred.txt";
model_out = "final.model";
}
virtual ~LinearObjFunction(void) {
}
// set parameters
inline void SetParam(const char *name, const char *val) {
model.param.SetParam(name, val);
lbfgs.SetParam(name, val);
if (!strcmp(name, "num_feature")) {
char ndigit[30];
sprintf(ndigit, "%lu", static_cast<unsigned long>(model.param.num_feature + 1));
lbfgs.SetParam("num_dim", ndigit);
}
if (!strcmp(name, "reg_L2")) {
reg_L2 = static_cast<float>(atof(val));
}
if (!strcmp(name, "nthread")) {
nthread = atoi(val);
}
if (!strcmp(name, "task")) task = val;
if (!strcmp(name, "model_in")) model_in = val;
if (!strcmp(name, "model_out")) model_out = val;
if (!strcmp(name, "name_pred")) name_pred = val;
}
inline void Run(void) {
if (model_in != "NULL") {
this->LoadModel(model_in.c_str());
}
if (task == "train") {
lbfgs.Run();
if (rabit::GetRank() == 0) {
this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
}
} else if (task == "pred") {
this->TaskPred();
} else {
utils::Error("unknown task=%s", task.c_str());
}
}
inline void TaskPred(void) {
utils::Check(model_in != "NULL",
"must set model_in for task=pred");
FILE *fp = utils::FopenCheck(name_pred.c_str(), "w");
for (size_t i = 0; i < dtrain.NumRow(); ++i) {
float pred = model.Predict(dtrain[i]);
fprintf(fp, "%g\n", pred);
}
fclose(fp);
printf("Finishing writing to %s\n", name_pred.c_str());
}
inline void LoadModel(const char *fname) {
Stream *fi = io::CreateStream(fname, "r");
std::string header; header.resize(4);
// check header for different binary encode
// can be base64 or binary
utils::Check(fi->Read(&header[0], 4) != 0, "invalid model");
// base64 format
if (header == "bs64") {
io::Base64InStream bsin(fi);
bsin.InitPosition();
model.Load(&bsin);
} else if (header == "binf") {
model.Load(fi);
} else {
utils::Error("invalid model file");
}
delete fi;
}
inline void SaveModel(const char *fname,
const float *wptr,
bool save_base64 = false) {
Stream *fo = io::CreateStream(fname, "w");
if (save_base64 != 0 || !strcmp(fname, "stdout")) {
fo->Write("bs64\t", 5);
io::Base64OutStream bout(fo);
model.Save(&bout, wptr);
bout.Finish('\n');
} else {
fo->Write("binf", 4);
model.Save(fo, wptr);
}
delete fo;
}
inline void LoadData(const char *fname) {
dtrain.Load(fname);
}
virtual size_t InitNumDim(void) {
if (model_in == "NULL") {
size_t ndim = dtrain.feat_dim;
rabit::Allreduce<rabit::op::Max>(&ndim, 1);
model.param.num_feature = std::max(ndim, model.param.num_feature);
}
return model.param.num_feature + 1;
}
virtual void InitModel(float *weight, size_t size) {
if (model_in == "NULL") {
memset(weight, 0, size * sizeof(float));  // memset takes an int, not a float
model.param.InitBaseScore();
} else {
rabit::Broadcast(model.weight, size * sizeof(float), 0);
memcpy(weight, model.weight, size * sizeof(float));
}
}
// load model
virtual void Load(rabit::Stream *fi) {
fi->Read(&model.param, sizeof(model.param));
}
virtual void Save(rabit::Stream *fo) const {
fo->Write(&model.param, sizeof(model.param));
}
virtual double Eval(const float *weight, size_t size) {
if (nthread != 0) omp_set_num_threads(nthread);
utils::Check(size == model.param.num_feature + 1,
"size consistency check");
double sum_val = 0.0;
#pragma omp parallel for schedule(static) reduction(+:sum_val)
for (size_t i = 0; i < dtrain.NumRow(); ++i) {
float py = model.param.PredictMargin(weight, dtrain[i]);
float fv = model.param.MarginToLoss(dtrain.labels[i], py);
sum_val += fv;
}
if (rabit::GetRank() == 0) {
// only add regularization once
if (reg_L2 != 0.0f) {
double sum_sqr = 0.0;
for (size_t i = 0; i < model.param.num_feature; ++i) {
sum_sqr += weight[i] * weight[i];
}
sum_val += 0.5 * reg_L2 * sum_sqr;
}
}
utils::Check(!std::isnan(sum_val), "nan occurs");
return sum_val;
}
virtual void CalcGrad(float *out_grad,
const float *weight,
size_t size) {
if (nthread != 0) omp_set_num_threads(nthread);
utils::Check(size == model.param.num_feature + 1,
"size consistency check");
memset(out_grad, 0, sizeof(float) * size);  // memset takes an int, not a float
double sum_gbias = 0.0;
#pragma omp parallel for schedule(static) reduction(+:sum_gbias)
for (size_t i = 0; i < dtrain.NumRow(); ++i) {
SparseMat::Vector v = dtrain[i];
float py = model.param.Predict(weight, v);
float grad = model.param.PredToGrad(dtrain.labels[i], py);
for (index_t j = 0; j < v.length; ++j) {
out_grad[v[j].findex] += v[j].fvalue * grad;
}
sum_gbias += grad;
}
out_grad[model.param.num_feature] = static_cast<float>(sum_gbias);
if (rabit::GetRank() == 0) {
// only add regularization once
if (reg_L2 != 0.0f) {
for (size_t i = 0; i < model.param.num_feature; ++i) {
out_grad[i] += reg_L2 * weight[i];
}
}
}
}
private:
std::string task;
std::string model_in;
std::string model_out;
std::string name_pred;
};
} // namespace linear
} // namespace rabit
int main(int argc, char *argv[]) {
if (argc < 2) {
// initialize rabit engine
rabit::Init(argc, argv);
if (rabit::GetRank() == 0) {
rabit::TrackerPrintf("Usage: <data_in> param=val\n");
}
rabit::Finalize();
return 0;
}
rabit::linear::LinearObjFunction *linear = new rabit::linear::LinearObjFunction();
if (!strcmp(argv[1], "stdin")) {
linear->LoadData(argv[1]);
rabit::Init(argc, argv);
} else {
rabit::Init(argc, argv);
linear->LoadData(argv[1]);
}
for (int i = 2; i < argc; ++i) {
char name[256], val[256];
if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
linear->SetParam(name, val);
}
}
linear->Run();
delete linear;
rabit::Finalize();
return 0;
}

View File

@ -1,134 +0,0 @@
/*!
* Copyright (c) 2015 by Contributors
* \file linear.h
* \brief Linear and Logistic regression
*
* \author Tianqi Chen
*/
#ifndef RABIT_LINEAR_H_
#define RABIT_LINEAR_H_
#include <omp.h>
#include "../utils/data.h"
#include "../solver/lbfgs.h"
namespace rabit {
namespace linear {
/*! \brief simple linear model */
struct LinearModel {
struct ModelParam {
/*! \brief global bias */
float base_score;
/*! \brief number of features */
size_t num_feature;
/*! \brief loss type*/
int loss_type;
// reserved field
int reserved[16];
// constructor
ModelParam(void) {
  memset(this, 0, sizeof(ModelParam));
  base_score = 0.5f;
  num_feature = 0;
  loss_type = 1;
}
// initialize base score
inline void InitBaseScore(void) {
utils::Check(base_score > 0.0f && base_score < 1.0f,
"base_score must be in (0,1) for logistic loss");
base_score = -std::log(1.0f / base_score - 1.0f);
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp("base_score", name)) {
base_score = static_cast<float>(atof(val));
}
if (!strcmp("num_feature", name)) {
num_feature = static_cast<size_t>(atol(val));
}
if (!strcmp("objective", name)) {
if (!strcmp("linear", val)) {
loss_type = 0;
} else if (!strcmp("logistic", val)) {
loss_type = 1;
} else {
utils::Error("unknown objective type %s\n", val);
}
}
}
// transform margin to prediction
inline float MarginToPred(float margin) const {
if (loss_type == 1) {
return 1.0f / (1.0f + std::exp(-margin));
} else {
return margin;
}
}
// margin to loss
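// (the branch below is the usual numerically stable rewrite: with
// p = sigmoid(margin) the loss equals -y*log(p) - (1-y)*log(1-p),
// and nlogprob = -log(p) is computed without overflowing exp())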
inline float MarginToLoss(float label, float margin) const {
if (loss_type == 1) {
float nlogprob;
if (margin > 0.0f) {
nlogprob = std::log(1.0f + std::exp(-margin));
} else {
nlogprob = -margin + std::log(1.0f + std::exp(margin));
}
return label * nlogprob +
(1.0f - label) * (margin + nlogprob);
} else {
float diff = margin - label;
return 0.5f * diff * diff;
}
}
inline float PredToGrad(float label, float pred) const {
return pred - label;
}
inline float PredictMargin(const float *weight,
const SparseMat::Vector &v) const {
// weight[num_feature] is bias
float sum = base_score + weight[num_feature];
for (unsigned i = 0; i < v.length; ++i) {
if (v[i].findex >= num_feature) continue;
sum += weight[v[i].findex] * v[i].fvalue;
}
return sum;
}
inline float Predict(const float *weight,
const SparseMat::Vector &v) const {
return MarginToPred(PredictMargin(weight, v));
}
};
// model parameter
ModelParam param;
  // weight vector of the model
float *weight;
LinearModel(void) : weight(NULL) {
}
~LinearModel(void) {
if (weight != NULL) delete [] weight;
}
// load model
inline void Load(rabit::Stream *fi) {
fi->Read(&param, sizeof(param));
if (weight == NULL) {
weight = new float[param.num_feature + 1];
}
fi->Read(weight, sizeof(float) * (param.num_feature + 1));
}
inline void Save(rabit::Stream *fo, const float *wptr = NULL) {
fo->Write(&param, sizeof(param));
if (wptr == NULL) wptr = weight;
fo->Write(wptr, sizeof(float) * (param.num_feature + 1));
}
inline float Predict(const SparseMat::Vector &v) const {
return param.Predict(weight, v);
}
};
} // namespace linear
} // namespace rabit
#endif // RABIT_LINEAR_H_
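
As a quick illustration of what `InitBaseScore` computes: it converts the prior probability into a constant margin offset (the logit), so that a model with all-zero weights predicts exactly `base_score`. A self-contained sketch in plain C++ (the `Sigmoid`/`Logit` helpers are illustrative, not part of linear.h):

```c++
#include <cmath>
#include <cstdio>

// illustrative helpers mirroring MarginToPred / InitBaseScore above
inline float Sigmoid(float m) { return 1.0f / (1.0f + std::exp(-m)); }
inline float Logit(float p)   { return -std::log(1.0f / p - 1.0f); }

int main() {
  float base_score = 0.7f;           // prior probability in (0,1)
  float offset = Logit(base_score);  // what InitBaseScore stores
  // with zero weights the margin is just the offset,
  // so the prediction recovers the prior exactly
  std::printf("pred = %g (expected %g)\n", Sigmoid(offset), base_score);
  return 0;
}
```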


@ -1,20 +0,0 @@
#!/bin/bash
if [ "$#" -lt 3 ];
then
echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
exit -1
fi
# put the local training file to HDFS
hadoop fs -rm -r -f $2/data
hadoop fs -rm -r -f $2/mushroom.linear.model
hadoop fs -mkdir $2/data
hadoop fs -put ../data/agaricus.txt.train $2/data
# submit to hadoop
../../tracker/rabit_hadoop_streaming.py -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
# get the final model file
hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model
./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model


@ -1,11 +0,0 @@
#!/bin/bash
if [[ $# -lt 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf *.model
k=$1
../../tracker/rabit_demo.py -n $k linear.mock ../data/agaricus.txt.train "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1


@ -1,14 +0,0 @@
#!/bin/bash
if [[ $# -lt 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf *.model
k=$1
# run linear model, the program will automatically split the inputs
../../tracker/rabit_demo.py -n $k linear.rabit ../data/agaricus.txt.train reg_L1=1
./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model


@ -1,20 +0,0 @@
#!/bin/bash
if [ "$#" -lt 3 ];
then
echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
exit -1
fi
# put the local training file to HDFS
hadoop fs -rm -r -f $2/mushroom.linear.model
hadoop fs -mkdir $2/data
hadoop fs -put ../data/agaricus.txt.train $2/data
# submit to hadoop
../../wormhole/tracker/dmlc_yarn.py -n $1 --vcores 1 ./linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}"
# get the final model file
hadoop fs -get $2/mushroom.linear.model ./linear.model
./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model


@ -1,49 +0,0 @@
# this is the common build script for rabit programs
# you do not have to use it
export LDFLAGS= -L../../lib -pthread -lm -lrt
export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fPIC -I../../include
# setup dmlc-core
ifeq ($(USE_DMLC),1)
include ../../dmlc-core/make/dmlc.mk
CFLAGS+= -DRABIT_USE_DMLC=1 -I ../../dmlc-core/include $(DMLC_CFLAGS)
LDFLAGS+= -L../../dmlc-core -ldmlc $(DMLC_LDFLAGS)
else
CFLAGS+= -DRABIT_USE_DMLC=0
endif
# setup hdfs
ifeq ($(USE_HDFS),1)
CFLAGS+= -DRABIT_USE_HDFS=1 -I$(HADOOP_HDFS_HOME)/include -I$(JAVA_HOME)/include
LDFLAGS+= -L$(HADOOP_HDFS_HOME)/lib/native -L$(LIBJVM) -lhdfs -ljvm
else
CFLAGS+= -DRABIT_USE_HDFS=0
endif
.PHONY: clean all lib mpi
all: $(BIN) $(MOCKBIN)
mpi: $(MPIBIN)
lib:
cd ../..;make lib/librabit.a lib/librabit_mock.a; cd -
libmpi:
cd ../..;make lib/librabit_mpi.a;cd -
$(BIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) -lrabit $(LDFLAGS)
$(MOCKBIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) -lrabit_mock $(LDFLAGS)
$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
$(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) -lrabit_mpi
clean:
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MOCKBIN) *~ ../src/*~


@ -1,24 +0,0 @@
#-----------------------------------------------------
# rabit-learn: compile configuration script
#
# This is the default configuration setup for rabit-learn
# If you want to change configuration, do the following steps:
#
# - copy this file to the root of the rabit-learn folder
# - modify the configuration you want
# - type make or make -j n in each of the folders
#----------------------------------------------------
# choice of compiler
export CC = gcc
export CXX = g++
export MPICXX = mpicxx
# whether to use HDFS support during compile
USE_HDFS = 1
# whether to use dmlc's io utils
USE_DMLC = 0
# path to libjvm.so
LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server


@ -1,669 +0,0 @@
/*!
* Copyright (c) 2015 by Contributors
* \file lbfgs.h
* \brief L-BFGS solver for general optimization problem
*
* \author Tianqi Chen
*/
#ifndef RABIT_LEARN_LBFGS_H_
#define RABIT_LEARN_LBFGS_H_
#include <cmath>
#include <rabit.h>
namespace rabit {
/*! \brief namespace of solver for general problems */
namespace solver {
/*!
* \brief objective function for optimizers
 * the objective function can also implement Save/Load
 * to persist any state parameters that need to be remembered
*/
template<typename DType>
class IObjFunction : public rabit::Serializable {
public:
// destructor
virtual ~IObjFunction(void){}
/*!
* \brief evaluate function values for a given weight
* \param weight weight of the function
* \param size size of the weight
*/
virtual double Eval(const DType *weight, size_t size) = 0;
/*!
   * \return the number of feature dimensions to be allocated;
* only called once during initialization
*/
virtual size_t InitNumDim(void) = 0;
/*!
* \brief initialize the weight before starting the solver
* only called once for initialization
*/
virtual void InitModel(DType *weight, size_t size) = 0;
/*!
* \brief calculate gradient for a given weight
* \param out_grad used to store the gradient value of the function
* \param weight weight of the function
* \param size size of the weight
*/
virtual void CalcGrad(DType *out_grad,
const DType *weight,
size_t size) = 0;
};
/*! \brief a basic L-BFGS solver */
template<typename DType>
class LBFGSSolver {
public:
LBFGSSolver(void) {
// set default values
reg_L1 = 0.0f;
max_linesearch_iter = 100;
linesearch_backoff = 0.5f;
linesearch_c1 = 1e-4;
min_lbfgs_iter = 5;
max_lbfgs_iter = 500;
lbfgs_stop_tol = 1e-5f;
silent = 0;
}
virtual ~LBFGSSolver(void) {}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
virtual void SetParam(const char *name, const char *val) {
if (!strcmp("num_dim", name)) {
gstate.num_dim = static_cast<size_t>(atol(val));
}
if (!strcmp("size_memory", name)) {
gstate.size_memory = static_cast<size_t>(atol(val));
}
if (!strcmp("reg_L1", name)) {
reg_L1 = static_cast<float>(atof(val));
}
if (!strcmp("lbfgs_stop_tol", name)) {
lbfgs_stop_tol = static_cast<float>(atof(val));
}
if (!strcmp("linesearch_backoff", name)) {
linesearch_backoff = static_cast<float>(atof(val));
}
if (!strcmp("max_linesearch_iter", name)) {
max_linesearch_iter = atoi(val);
}
if (!strcmp("max_lbfgs_iter", name)) {
max_lbfgs_iter = atoi(val);
}
if (!strcmp("min_lbfgs_iter", name)) {
min_lbfgs_iter = atoi(val);
}
if (!strcmp("linesearch_c1", name)) {
linesearch_c1 = static_cast<float>(atof(val));
}
}
/*!
* \brief set objective function to optimize
   *  the objective function only needs to evaluate and calculate the
   *  gradient with respect to the current subset of data
   * \param obj the objective function to optimize
*/
virtual void SetObjFunction(IObjFunction<DType> *obj) {
gstate.obj = obj;
}
/*!
* \brief initialize the LBFGS solver
* user must already set the objective function
*/
virtual void Init(void) {
utils::Check(gstate.obj != NULL,
"LBFGSSolver.Init must SetObjFunction first");
int version = rabit::LoadCheckPoint(&gstate, &hist);
if (version == 0) {
gstate.num_dim = gstate.obj->InitNumDim();
} else {
printf("restart from version=%d\n", version);
}
{
// decide parameter partition
size_t nproc = rabit::GetWorldSize();
size_t rank = rabit::GetRank();
size_t step = (gstate.num_dim + nproc - 1) / nproc;
// upper align
step = (step + 7) / 8 * 8;
utils::Assert(step * nproc >= gstate.num_dim, "BUG");
range_begin_ = std::min(rank * step, gstate.num_dim);
range_end_ = std::min((rank + 1) * step, gstate.num_dim);
}
if (version == 0) {
gstate.Init();
hist.Init(range_end_ - range_begin_, gstate.size_memory);
gstate.obj->InitModel(gstate.weight, gstate.num_dim);
// broadcast initialize model
rabit::Broadcast(gstate.weight,
sizeof(DType) * gstate.num_dim, 0);
gstate.old_objval = this->Eval(gstate.weight);
gstate.init_objval = gstate.old_objval;
if (silent == 0 && rabit::GetRank() == 0) {
rabit::TrackerPrintf
("L-BFGS solver starts, num_dim=%lu, init_objval=%g, size_memory=%lu, RAM-approx=%lu\n",
gstate.num_dim, gstate.init_objval, gstate.size_memory,
gstate.MemCost() + hist.MemCost());
}
}
}
/*!
* \brief get the current weight vector
   *  note that if the update function is called,
   *  the content of the weight vector is no longer valid
* \return weight vector
*/
virtual DType *GetWeight(void) {
return gstate.weight;
}
/*!
* \brief update the weight for one L-BFGS iteration
* \return whether stopping condition is met
*/
virtual bool UpdateOneIter(void) {
bool stop = false;
GlobalState &g = gstate;
g.obj->CalcGrad(g.grad, g.weight, g.num_dim);
rabit::Allreduce<rabit::op::Sum>(g.grad, g.num_dim);
// find change direction
double vdot = FindChangeDirection(g.tempw, g.grad, g.weight);
// line-search, g.grad is now new weight
int iter = BacktrackLineSearch(g.grad, g.tempw, g.weight, vdot);
utils::Check(iter < max_linesearch_iter, "line search failed");
// swap new weight
std::swap(g.weight, g.grad);
// check stop condition
if (gstate.num_iteration > static_cast<size_t>(min_lbfgs_iter)) {
if (g.old_objval - g.new_objval < lbfgs_stop_tol * g.init_objval) {
return true;
}
}
if (silent == 0 && rabit::GetRank() == 0) {
rabit::TrackerPrintf
("[%d] L-BFGS: linesearch finishes in %d rounds, new_objval=%g, improvment=%g\n",
gstate.num_iteration, iter,
gstate.new_objval,
gstate.old_objval - gstate.new_objval);
}
gstate.old_objval = gstate.new_objval;
rabit::CheckPoint(&gstate, &hist);
return stop;
}
/*! \brief run optimization */
virtual void Run(void) {
this->Init();
while (gstate.num_iteration < static_cast<size_t>(max_lbfgs_iter)) {
if (this->UpdateOneIter()) break;
}
if (silent == 0 && rabit::GetRank() == 0) {
size_t nonzero = 0;
for (size_t i = 0; i < gstate.num_dim; ++i) {
if (gstate.weight[i] != 0.0f) nonzero += 1;
}
rabit::TrackerPrintf
("L-BFGS: finishes at iteration %d, %lu/%lu active weights\n",
gstate.num_iteration, nonzero, gstate.num_dim);
}
}
protected:
// find the delta value, given gradient
// return dot(dir, l1grad)
virtual double FindChangeDirection(DType *dir,
const DType *grad,
const DType *weight) {
int m = static_cast<int>(gstate.size_memory);
int n = static_cast<int>(hist.num_useful());
if (n < m) {
utils::Assert(hist.num_useful() == gstate.num_iteration,
"BUG2, n=%d, it=%d", n, gstate.num_iteration);
} else {
utils::Assert(n == m, "BUG3");
}
const size_t num_dim = gstate.num_dim;
const DType *gsub = grad + range_begin_;
const size_t nsub = range_end_ - range_begin_;
double vdot = 0.0;
if (n != 0) {
// hist[m + n - 1] stores old gradient
Minus(hist[m + n - 1], gsub, hist[m + n - 1], nsub);
SetL1Dir(hist[2 * m], gsub, weight + range_begin_, nsub);
// index set for calculating results
std::vector<std::pair<size_t, size_t> > idxset;
for (int j = 0; j < n; ++j) {
idxset.push_back(std::make_pair(j, 2 * m));
idxset.push_back(std::make_pair(j, n - 1));
idxset.push_back(std::make_pair(j, m + n - 1));
}
for (int j = 0; j < n; ++j) {
idxset.push_back(std::make_pair(m + j, 2 * m));
idxset.push_back(std::make_pair(m + j, m + n - 1));
}
// calculate dot products
std::vector<double> tmp(idxset.size());
for (size_t i = 0; i < tmp.size(); ++i) {
tmp[i] = hist.CalcDot(idxset[i].first, idxset[i].second);
}
rabit::Allreduce<rabit::op::Sum>(BeginPtr(tmp), tmp.size());
for (size_t i = 0; i < tmp.size(); ++i) {
gstate.DotBuf(idxset[i].first, idxset[i].second) = tmp[i];
}
// BFGS steps, use vector-free update
// parameterize vector using basis in hist
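      // basis layout: delta[0..m-1] weight updates s_j (hist[j]),
      // delta[m..2m-1] gradient differences y_j (hist[m+j]),
      // delta[2m] the steepest-descent proposal hist[2m];
      // the two-loop recursion below then works only on the pairwise
      // dot products cached in DotBuf (just Allreduce-summed above),
      // never on full-length vectors -- the "vector-free" trick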
std::vector<double> alpha(n);
std::vector<double> delta(2 * m + 1, 0.0);
delta[2 * m] = 1.0;
// backward step
for (int j = n - 1; j >= 0; --j) {
double vsum = 0.0;
for (size_t k = 0; k < delta.size(); ++k) {
vsum += delta[k] * gstate.DotBuf(k, j);
}
alpha[j] = vsum / gstate.DotBuf(j, m + j);
delta[m + j] = delta[m + j] - alpha[j];
}
// scale
double scale = gstate.DotBuf(n - 1, m + n - 1) /
gstate.DotBuf(m + n - 1, m + n - 1);
for (size_t k = 0; k < delta.size(); ++k) {
delta[k] *= scale;
}
// forward step
for (int j = 0; j < n; ++j) {
double vsum = 0.0;
for (size_t k = 0; k < delta.size(); ++k) {
vsum += delta[k] * gstate.DotBuf(k, m + j);
}
double beta = vsum / gstate.DotBuf(j, m + j);
delta[j] = delta[j] + (alpha[j] - beta);
}
// set all to zero
std::fill(dir, dir + num_dim, 0.0f);
DType *dirsub = dir + range_begin_;
for (int i = 0; i < n; ++i) {
AddScale(dirsub, dirsub, hist[m + i], delta[m + i], nsub);
}
AddScale(dirsub, dirsub, hist[2 * m], delta[2 * m], nsub);
for (int i = 0; i < n; ++i) {
AddScale(dirsub, dirsub, hist[i], delta[i], nsub);
}
FixDirL1Sign(dirsub, hist[2 * m], nsub);
vdot = -Dot(dirsub, hist[2 * m], nsub);
// allreduce to get full direction
rabit::Allreduce<rabit::op::Sum>(dir, num_dim);
rabit::Allreduce<rabit::op::Sum>(&vdot, 1);
} else {
SetL1Dir(dir, grad, weight, num_dim);
vdot = -Dot(dir, dir, num_dim);
}
// shift the history record
if (n < m) {
n += 1;
} else {
gstate.Shift(); hist.Shift();
}
hist.set_num_useful(n);
// copy gradient to hist[m + n - 1]
memcpy(hist[m + n - 1], gsub, nsub * sizeof(DType));
return vdot;
}
// line search for given direction
  // returns the number of backtracking iterations used
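  // a step of size alpha is accepted once the Armijo sufficient-decrease
  // condition holds: f(w + alpha*d) - f(w) <= c1 * alpha * dot(d, l1grad)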
inline int BacktrackLineSearch(DType *new_weight,
const DType *dir,
const DType *weight,
double dot_dir_l1grad) {
utils::Assert(dot_dir_l1grad < 0.0f,
"gradient error, dotv=%g", dot_dir_l1grad);
double alpha = 1.0;
double backoff = linesearch_backoff;
// unit descent direction in first iter
if (gstate.num_iteration == 0) {
utils::Assert(hist.num_useful() == 1, "hist.nuseful");
alpha = 1.0f / std::sqrt(-dot_dir_l1grad);
backoff = 0.1f;
}
int iter = 0;
double old_val = gstate.old_objval;
double c1 = this->linesearch_c1;
while (true) {
const size_t num_dim = gstate.num_dim;
if (++iter >= max_linesearch_iter) return iter;
AddScale(new_weight, weight, dir, alpha, num_dim);
this->FixWeightL1Sign(new_weight, weight, num_dim);
double new_val = this->Eval(new_weight);
if (new_val - old_val <= c1 * dot_dir_l1grad * alpha) {
gstate.new_objval = new_val; break;
}
alpha *= backoff;
}
// hist[n - 1] = new_weight - weight
Minus(hist[hist.num_useful() - 1],
new_weight + range_begin_,
weight + range_begin_,
range_end_ - range_begin_);
gstate.num_iteration += 1;
return iter;
}
// OWL-QN step for L1 regularization
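  // the result is the negative "pseudo-gradient" of f(w) + reg_L1 * |w|_1:
  // where w_i != 0 the L1 term contributes sign(w_i) * reg_L1; at w_i == 0
  // the subgradient is the interval [-reg_L1, reg_L1], so the direction is
  // nonzero only when grad[i] falls outside that interval, otherwise the
  // coordinate is held at zero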
inline void SetL1Dir(DType *dst,
const DType *grad,
const DType *weight,
size_t size) {
if (reg_L1 == 0.0) {
for (size_t i = 0; i < size; ++i) {
dst[i] = -grad[i];
}
} else {
for (size_t i = 0; i < size; ++i) {
if (weight[i] > 0.0f) {
dst[i] = -grad[i] - reg_L1;
} else if (weight[i] < 0.0f) {
dst[i] = -grad[i] + reg_L1;
} else {
if (grad[i] < -reg_L1) {
dst[i] = -grad[i] - reg_L1;
} else if (grad[i] > reg_L1) {
dst[i] = -grad[i] + reg_L1;
} else {
dst[i] = 0.0;
}
}
}
}
}
// OWL-QN step: fix direction sign to be consistent with proposal
inline void FixDirL1Sign(DType *dir,
const DType *steepdir,
size_t size) {
if (reg_L1 != 0.0f) {
for (size_t i = 0; i < size; ++i) {
if (dir[i] * steepdir[i] <= 0.0f) {
dir[i] = 0.0f;
}
}
}
}
  // OWL-QN step: keep the new weight in the same orthant as the old weight
inline void FixWeightL1Sign(DType *new_weight,
const DType *weight,
size_t size) {
if (reg_L1 != 0.0f) {
for (size_t i = 0; i < size; ++i) {
if (new_weight[i] * weight[i] < 0.0f) {
new_weight[i] = 0.0f;
}
}
}
}
inline double Eval(const DType *weight) {
double val = gstate.obj->Eval(weight, gstate.num_dim);
rabit::Allreduce<rabit::op::Sum>(&val, 1);
if (reg_L1 != 0.0f) {
double l1norm = 0.0;
for (size_t i = 0; i < gstate.num_dim; ++i) {
l1norm += std::abs(weight[i]);
}
val += l1norm * reg_L1;
}
return val;
}
private:
// helper functions
// dst = lhs + rhs * scale
inline static void AddScale(DType *dst,
const DType *lhs,
const DType *rhs,
DType scale,
size_t size) {
for (size_t i = 0; i < size; ++i) {
dst[i] = lhs[i] + rhs[i] * scale;
}
}
// dst = lhs - rhs
inline static void Minus(DType *dst,
const DType *lhs,
const DType *rhs,
size_t size) {
for (size_t i = 0; i < size; ++i) {
dst[i] = lhs[i] - rhs[i];
}
}
// return dot(lhs, rhs)
inline static double Dot(const DType *lhs,
const DType *rhs,
size_t size) {
double res = 0.0;
for (size_t i = 0; i < size; ++i) {
res += lhs[i] * rhs[i];
}
return res;
}
// map rolling array index
inline static size_t MapIndex(size_t i, size_t offset,
size_t size_memory) {
if (i == 2 * size_memory) return i;
if (i < size_memory) {
return (i + offset) % size_memory;
} else {
utils::Assert(i < 2 * size_memory,
"MapIndex: index exceed bound, i=%lu", i);
return (i + offset) % size_memory + size_memory;
}
}
// global solver state
struct GlobalState : public rabit::Serializable {
public:
// memory size of L-BFGS
size_t size_memory;
// number of iterations passed
size_t num_iteration;
// number of features in the solver
size_t num_dim;
    // initial objective value
    double init_objval;
    // objective value of the previous iteration
    double old_objval;
// new objective value
double new_objval;
// objective function
IObjFunction<DType> *obj;
    // temporary storage
DType *grad, *weight, *tempw;
// constructor
GlobalState(void)
: obj(NULL), grad(NULL),
weight(NULL), tempw(NULL) {
size_memory = 10;
num_iteration = 0;
num_dim = 0;
      init_objval = 0.0;
      old_objval = 0.0;
      new_objval = 0.0;
offset_ = 0;
}
~GlobalState(void) {
if (grad != NULL) {
delete [] grad;
delete [] weight;
delete [] tempw;
}
}
    // initialize the space of the rolling array
inline void Init(void) {
size_t n = size_memory * 2 + 1;
data.resize(n * n, 0.0);
this->AllocSpace();
}
// memory cost
inline size_t MemCost(void) const {
return sizeof(DType) * 3 * num_dim;
}
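    // DotBuf is a symmetric cache of pairwise dot products between the
    // history vectors; indices are remapped through MapIndex so Shift()
    // renames rows/columns in O(1) instead of moving any data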
inline double &DotBuf(size_t i, size_t j) {
if (i > j) std::swap(i, j);
return data[MapIndex(i, offset_, size_memory) * (size_memory * 2 + 1) +
MapIndex(j, offset_, size_memory)];
}
    // load the solver state
virtual void Load(rabit::Stream *fi) {
fi->Read(&size_memory, sizeof(size_memory));
fi->Read(&num_iteration, sizeof(num_iteration));
fi->Read(&num_dim, sizeof(num_dim));
fi->Read(&init_objval, sizeof(init_objval));
fi->Read(&old_objval, sizeof(old_objval));
fi->Read(&offset_, sizeof(offset_));
fi->Read(&data);
this->AllocSpace();
fi->Read(weight, sizeof(DType) * num_dim);
obj->Load(fi);
}
    // save the solver state
virtual void Save(rabit::Stream *fo) const {
fo->Write(&size_memory, sizeof(size_memory));
fo->Write(&num_iteration, sizeof(num_iteration));
fo->Write(&num_dim, sizeof(num_dim));
fo->Write(&init_objval, sizeof(init_objval));
fo->Write(&old_objval, sizeof(old_objval));
fo->Write(&offset_, sizeof(offset_));
fo->Write(data);
fo->Write(weight, sizeof(DType) * num_dim);
obj->Save(fo);
}
inline void Shift(void) {
offset_ = (offset_ + 1) % size_memory;
}
private:
// rolling offset in the current memory
size_t offset_;
std::vector<double> data;
    // allocate space
inline void AllocSpace(void) {
if (grad == NULL) {
grad = new DType[num_dim];
weight = new DType[num_dim];
tempw = new DType[num_dim];
}
}
};
/*! \brief rolling array that carries history information */
struct HistoryArray : public rabit::Serializable {
public:
    HistoryArray(void) : dptr_(NULL) {
      num_col_ = 0; stride_ = 0;
      size_memory_ = 0; offset_ = 0;
      num_useful_ = 0;
    }
~HistoryArray(void) {
if (dptr_ != NULL) delete [] dptr_;
}
    // initialize the space of the rolling array
inline void Init(size_t num_col, size_t size_memory) {
      if (dptr_ != NULL) {
        // always release with delete[]; also avoids leaking the old
        // buffer when Init is called again with the same shape
        delete [] dptr_;
        dptr_ = NULL;
      }
num_col_ = num_col;
size_memory_ = size_memory;
stride_ = num_col_;
offset_ = 0;
size_t n = size_memory * 2 + 1;
dptr_ = new DType[n * stride_];
}
// memory cost
inline size_t MemCost(void) const {
return sizeof(DType) * (size_memory_ * 2 + 1) * stride_;
}
// fetch element from rolling array
inline const DType *operator[](size_t i) const {
return dptr_ + MapIndex(i, offset_, size_memory_) * stride_;
}
inline DType *operator[](size_t i) {
return dptr_ + MapIndex(i, offset_, size_memory_) * stride_;
}
// shift array: arr_old -> arr_new
// for i in [0, size_memory - 1), arr_new[i] = arr_old[i + 1]
// for i in [size_memory, 2 * size_memory - 1), arr_new[i] = arr_old[i + 1]
    // arr_old[0] and arr_old[size_memory] will be discarded
inline void Shift(void) {
offset_ = (offset_ + 1) % size_memory_;
}
inline double CalcDot(size_t i, size_t j) const {
return Dot((*this)[i], (*this)[j], num_col_);
}
    // get the number of useful history entries
inline const size_t &num_useful(void) const {
return num_useful_;
}
    // set the number of useful history entries
inline void set_num_useful(size_t num_useful) {
utils::Assert(num_useful <= size_memory_,
"num_useful exceed bound");
num_useful_ = num_useful;
}
    // load the rolling array
virtual void Load(rabit::Stream *fi) {
fi->Read(&num_col_, sizeof(num_col_));
fi->Read(&stride_, sizeof(stride_));
fi->Read(&size_memory_, sizeof(size_memory_));
fi->Read(&num_useful_, sizeof(num_useful_));
this->Init(num_col_, size_memory_);
for (size_t i = 0; i < num_useful_; ++i) {
fi->Read((*this)[i], num_col_ * sizeof(DType));
fi->Read((*this)[i + size_memory_], num_col_ * sizeof(DType));
}
}
    // save the rolling array
virtual void Save(rabit::Stream *fo) const {
fo->Write(&num_col_, sizeof(num_col_));
fo->Write(&stride_, sizeof(stride_));
fo->Write(&size_memory_, sizeof(size_memory_));
fo->Write(&num_useful_, sizeof(num_useful_));
for (size_t i = 0; i < num_useful_; ++i) {
fo->Write((*this)[i], num_col_ * sizeof(DType));
fo->Write((*this)[i + size_memory_], num_col_ * sizeof(DType));
}
}
private:
    // number of columns in each array
    size_t num_col_;
    // stride of each column, for alignment
    size_t stride_;
// memory size of L-BFGS
size_t size_memory_;
    // number of history entries currently in use
size_t num_useful_;
// rolling offset in the current memory
size_t offset_;
// data pointer
DType *dptr_;
};
// data structure for LBFGS
GlobalState gstate;
HistoryArray hist;
// silent
int silent;
// the subrange of current node
size_t range_begin_;
size_t range_end_;
  // L1 regularization coefficient
float reg_L1;
// c1 ratio for line search
float linesearch_c1;
float linesearch_backoff;
int max_linesearch_iter;
int max_lbfgs_iter;
int min_lbfgs_iter;
float lbfgs_stop_tol;
};
} // namespace solver
} // namespace rabit
#endif // RABIT_LEARN_LBFGS_H_
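
For orientation, a worker drives the solver above by implementing `IObjFunction` and calling `Run()`. Below is a minimal, untested sketch under the interfaces defined in this header; the quadratic objective and every name in it are made up for illustration:

```c++
#include <algorithm>
#include <cstdio>
#include <rabit.h>
#include "lbfgs.h"

// toy objective: f(w) = sum_i (w_i - 1)^2, minimized at w = 1
class QuadObj : public rabit::solver::IObjFunction<float> {
 public:
  virtual ~QuadObj(void) {}
  virtual size_t InitNumDim(void) { return 8; }
  virtual void InitModel(float *weight, size_t size) {
    std::fill(weight, weight + size, 0.0f);
  }
  virtual double Eval(const float *weight, size_t size) {
    // the solver Allreduce-sums per-worker values, so divide by the
    // world size to keep the global objective equal to f(w)
    double val = 0.0;
    for (size_t i = 0; i < size; ++i) {
      val += (weight[i] - 1.0) * (weight[i] - 1.0);
    }
    return val / rabit::GetWorldSize();
  }
  virtual void CalcGrad(float *out_grad, const float *weight, size_t size) {
    for (size_t i = 0; i < size; ++i) {
      out_grad[i] = 2.0f * (weight[i] - 1.0f) / rabit::GetWorldSize();
    }
  }
  // nothing worth checkpointing in this toy objective
  virtual void Load(rabit::Stream *fi) {}
  virtual void Save(rabit::Stream *fo) const {}
};

int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  QuadObj obj;
  rabit::solver::LBFGSSolver<float> solver;
  solver.SetObjFunction(&obj);
  solver.SetParam("max_lbfgs_iter", "50");
  solver.Run();  // Init() + iterate; checkpoints via rabit each round
  if (rabit::GetRank() == 0) {
    std::printf("w[0] = %g\n", solver.GetWeight()[0]);
  }
  rabit::Finalize();
  return 0;
}
```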


@ -1,101 +0,0 @@
/*!
* Copyright (c) 2015 by Contributors
* \file data.h
 * \brief simple data structures used by the models
*
* \author Tianqi Chen
*/
#ifndef RABIT_LEARN_DATA_H_
#define RABIT_LEARN_DATA_H_
#include <vector>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <limits>
#include <cmath>
#include <sstream>
#include <rabit.h>
#include "../io/io.h"
namespace rabit {
// typedef index type
typedef unsigned index_t;
/*! \brief sparse matrix, CSR format */
struct SparseMat {
// sparse matrix entry
struct Entry {
// feature index
index_t findex;
// feature value
float fvalue;
};
// sparse vector
struct Vector {
const Entry *data;
index_t length;
inline const Entry &operator[](size_t i) const {
return data[i];
}
};
inline Vector operator[](size_t i) const {
Vector v;
v.data = &data[0] + row_ptr[i];
    v.length = static_cast<index_t>(row_ptr[i + 1] - row_ptr[i]);
return v;
}
// load data from LibSVM format
inline void Load(const char *fname) {
io::InputSplit *in =
io::CreateInputSplit
(fname, rabit::GetRank(),
rabit::GetWorldSize());
row_ptr.clear();
row_ptr.push_back(0);
data.clear();
feat_dim = 0;
std::string line;
while (in->ReadLine(&line)) {
float label;
std::istringstream ss(line);
ss >> label;
Entry e;
unsigned long fidx;
while (!ss.eof()) {
if (!(ss >> fidx)) break;
ss.ignore(32, ':');
if (!(ss >> e.fvalue)) break;
e.findex = static_cast<index_t>(fidx);
data.push_back(e);
        feat_dim = std::max(static_cast<size_t>(fidx), feat_dim);
}
labels.push_back(label);
row_ptr.push_back(data.size());
}
delete in;
feat_dim += 1;
    utils::Check(feat_dim < std::numeric_limits<index_t>::max(),
                 "feature dimension exceeds the limit of index_t, "\
                 "consider changing index_t to unsigned long");
}
inline size_t NumRow(void) const {
return row_ptr.size() - 1;
}
// memory cost
inline size_t MemCost(void) const {
return data.size() * sizeof(Entry);
}
// maximum feature dimension
size_t feat_dim;
std::vector<size_t> row_ptr;
std::vector<Entry> data;
std::vector<float> labels;
};
/*! \brief returns a random number in [0, value) */
inline int Random(int value) {
return rand() % value;
}
} // namespace rabit
#endif // RABIT_LEARN_DATA_H_
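
To make the LibSVM parsing in `Load` concrete, here is the same per-record logic as a self-contained sketch (standard C++ only; the sample record is made up):

```c++
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct Entry { unsigned findex; float fvalue; };

int main() {
  // one LibSVM record: label followed by findex:fvalue pairs
  std::string line = "1 3:0.5 7:1.25 12:2.0";
  std::istringstream ss(line);
  float label;
  ss >> label;
  std::vector<Entry> row;
  unsigned long fidx;
  while (ss >> fidx) {               // read feature index
    ss.ignore(32, ':');              // skip the ':' separator
    Entry e;
    e.findex = static_cast<unsigned>(fidx);
    if (!(ss >> e.fvalue)) break;    // read feature value
    row.push_back(e);
  }
  std::cout << "label=" << label << " nnz=" << row.size() << std::endl;
  return 0;
}
```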