Merge commit 'a30045c7cc54344e2084fb1fa3e01bfafc737188'
commit f55f8f023f
@@ -5,7 +5,8 @@ rabit is a lightweight library that provides a fault tolerant interface of Allr
* [Tutorial](guide)
* [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc)
* You can also directly read the [interface header](include/rabit.h)
* [Machine Learning Tools](rabit-learn)
* [Distributed Machine Learning Tools](https://github.com/dmlc/wormhole)
  - Rabit is one of the backbone libraries that support the wormhole machine learning tools

Features
====
@@ -33,5 +34,4 @@ Contributing
Rabit is an open-source library; contributions are welcome, including:
* The rabit core library.
* Customized tracker scripts for new platforms and interfaces for new languages.
* Toolkits, benchmarks, and resources (links to related repos).
* Tutorials and examples about the library.

@@ -151,7 +151,7 @@ This section tries to give examples of different aspects of the rabit API.

#### Structure of a Rabit Program
The following code illustrates the common structure of a rabit program. This is an abstract example;
you can also refer to [kmeans.cc](../rabit-learn/kmeans/kmeans.cc) for an example implementation of the kmeans algorithm.
you can also refer to [wormhole](https://github.com/dmlc/wormhole/blob/master/learn/kmeans/kmeans.cc) for an example implementation of the kmeans algorithm.

```c++
#include <rabit.h>
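// The diff view truncated the original listing here; the remainder below is
// a reconstructed sketch of the structure described above, based on the
// rabit calls used elsewhere in this commit (Init, LoadCheckPoint,
// Allreduce, CheckPoint, Finalize). MyModel is a hypothetical placeholder
// for a rabit::Serializable model type, not part of the original example.
int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);                   // initialize the rabit engine
  MyModel model;
  // returns the iteration to resume from; 0 means a fresh start
  int iter = rabit::LoadCheckPoint(&model);
  if (iter == 0) model.InitModel();
  const int max_iter = 100;
  for (int r = iter; r < max_iter; ++r) {
    float stats[2] = {0.0f, 0.0f};
    // ... fill stats with statistics computed from the local data partition ...
    rabit::Allreduce<rabit::op::Sum>(stats, 2);  // aggregate across all workers
    // ... update model using the globally aggregated statistics ...
    rabit::CheckPoint(&model);               // checkpoint for fault tolerance
  }
  rabit::Finalize();
  return 0;
}
```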
@@ -8,6 +8,10 @@
#include <cstdio>
#include <string>
#include <vector>
#include <istream>
#include <ostream>
#include <streambuf>
#include <cassert>

/*! \brief namespace for dmlc */
namespace dmlc {
@@ -38,7 +42,6 @@ class Stream {
   * \param uri the uri of the input; currently we support
   *   hdfs://, s3://, and file://; by default file:// will be used
   * \param flag can be "w", "r", "a"
   * \return a created stream
   */
  static Stream *Create(const char *uri, const char* const flag);
  // helper functions to write/read different data structures
@@ -103,11 +106,14 @@ class Serializable {
class InputSplit {
 public:
  /*!
   * \brief read next line, store into out_data
   * \brief read next record, store into out_data
   *   the data in the returned record depends on the input data format:
   *   if the input is text data, each line is returned as a record (\n not included)
   *   if the input is recordio, each record is returned
   * \param out_data the string that stores the record data; \n is not included
   * \return true if the next record was found, false if we have read all the records
   */
  virtual bool ReadLine(std::string *out_data) = 0;
  virtual bool ReadRecord(std::string *out_data) = 0;
  /*! \brief destructor */
  virtual ~InputSplit(void) {}
  /*!
@@ -116,13 +122,133 @@ class InputSplit {
   * \param uri the uri of the input, can contain hdfs prefix
   * \param part_index the part id of the current input
   * \param num_parts total number of splits
   * \return a created input split
   */
  static InputSplit* Create(const char *uri,
                            unsigned part_index,
                            unsigned num_parts);
};

/*!
 * \brief a std::ostream class that can wrap Stream objects,
 *  so that a std::ostream can be used to write to the underlying Stream
 *
 * Usage example:
 * \code
 *
 *   Stream *fs = Stream::Create("hdfs:///test.txt", "w");
 *   dmlc::ostream os(fs);
 *   os << "hello world" << std::endl;
 *   delete fs;
 * \endcode
 */
class ostream : public std::basic_ostream<char> {
 public:
  /*!
   * \brief construct std::ostream type
   * \param stream the Stream output to be used
   * \param buffer_size internal streambuf size
   */
  explicit ostream(Stream *stream,
                   size_t buffer_size = 1 << 10)
      : basic_ostream<char>(NULL), buf_(buffer_size) {
    this->set_stream(stream);
  }
  // explicitly synchronize the buffer
  virtual ~ostream() {
    buf_.pubsync();
  }
  /*!
   * \brief set internal stream to be stream, reset states
   * \param stream new stream as output
   */
  inline void set_stream(Stream *stream) {
    buf_.set_stream(stream);
    this->rdbuf(&buf_);
  }

 private:
  // internal streambuf
  class OutBuf : public std::streambuf {
   public:
    explicit OutBuf(size_t buffer_size)
        : stream_(NULL), buffer_(buffer_size) {
      assert(buffer_.size() > 0);
    }
    // set stream to the buffer
    inline void set_stream(Stream *stream);

   private:
    /*! \brief internal stream by StreamBuf */
    Stream *stream_;
    /*! \brief internal buffer */
    std::vector<char> buffer_;
    // override sync
    inline int_type sync(void);
    // override overflow
    inline int_type overflow(int c);
  };
  /*! \brief buffer of the stream */
  OutBuf buf_;
};

/*!
 * \brief a std::istream class that can wrap Stream objects,
 *  so that a std::istream can be used to read from the underlying Stream
 *
 * Usage example:
 * \code
 *
 *   Stream *fs = Stream::Create("hdfs:///test.txt", "r");
 *   dmlc::istream is(fs);
 *   is >> mydata;
 *   delete fs;
 * \endcode
 */
class istream : public std::basic_istream<char> {
 public:
  /*!
   * \brief construct std::istream type
   * \param stream the Stream input to be used
   * \param buffer_size internal buffer size
   */
  explicit istream(Stream *stream,
                   size_t buffer_size = 1 << 10)
      : basic_istream<char>(NULL), buf_(buffer_size) {
    this->set_stream(stream);
  }
  virtual ~istream() {}
  /*!
   * \brief set internal stream to be stream, reset states
   * \param stream new stream as input
   */
  inline void set_stream(Stream *stream) {
    buf_.set_stream(stream);
    this->rdbuf(&buf_);
  }

 private:
  // internal streambuf
  class InBuf : public std::streambuf {
   public:
    explicit InBuf(size_t buffer_size)
        : stream_(NULL), buffer_(buffer_size) {
      assert(buffer_.size() > 0);
    }
    // set stream to the buffer
    inline void set_stream(Stream *stream);

   private:
    /*! \brief internal stream by StreamBuf */
    Stream *stream_;
    /*! \brief internal buffer */
    std::vector<char> buffer_;
    // override underflow
    inline int_type underflow();
  };
  /*! \brief input buffer */
  InBuf buf_;
};

// implementations of inline functions
template<typename T>
inline void Stream::Write(const std::vector<T> &vec) {
@@ -160,5 +286,48 @@ inline bool Stream::Read(std::string *out_str) {
  }
  return true;
}

// implementations for ostream
inline void ostream::OutBuf::set_stream(Stream *stream) {
  if (stream_ != NULL) this->pubsync();
  this->stream_ = stream;
  this->setp(&buffer_[0], &buffer_[0] + buffer_.size() - 1);
}
inline int ostream::OutBuf::sync(void) {
  if (stream_ == NULL) return -1;
  std::ptrdiff_t n = pptr() - pbase();
  stream_->Write(pbase(), n);
  this->pbump(-n);
  return 0;
}
inline int ostream::OutBuf::overflow(int c) {
  *(this->pptr()) = c;
  std::ptrdiff_t n = pptr() - pbase();
  this->pbump(-n);
  if (c == EOF) {
    stream_->Write(pbase(), n);
  } else {
    stream_->Write(pbase(), n + 1);
  }
  return c;
}

// implementations for istream
inline void istream::InBuf::set_stream(Stream *stream) {
  stream_ = stream;
  this->setg(&buffer_[0], &buffer_[0], &buffer_[0]);
}
inline int istream::InBuf::underflow() {
  char *bhead = &buffer_[0];
  if (this->gptr() == this->egptr()) {
    size_t sz = stream_->Read(bhead, buffer_.size());
    this->setg(bhead, bhead, bhead + sz);
  }
  if (this->gptr() == this->egptr()) {
    return traits_type::eof();
  } else {
    return traits_type::to_int_type(*gptr());
  }
}
}  // namespace dmlc
#endif  // DMLC_IO_H_

subtree/rabit/rabit-learn/.gitignore (vendored, 2 lines deleted)
@@ -1,2 +0,0 @@
config.mk
*.log
@@ -1,17 +0,0 @@
Rabit-Learn
====
This folder contains implementations of distributed machine learning algorithms using rabit.
It also contains links to the machine learning packages that use rabit.

* Contributions of toolkits, examples, and benchmarks are more than welcome!


Toolkits
====
* [KMeans Clustering](kmeans)
* [Linear and Logistic Regression](linear)
* [XGBoost: eXtreme Gradient Boosting](https://github.com/tqchen/xgboost/tree/master/multi-node)
  - xgboost is a very fast boosted tree (also known as GBDT) library that can run more than
    10 times faster than existing packages
  - Rabit carries xgboost to the distributed environment, inheriting all the benefits of the
    single-node version, and scales it to even larger problems
@@ -1,2 +0,0 @@
This folder contains processed example datasets used by the demos.
Copyright of the datasets belongs to the original copyright holders.
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,126 +0,0 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i
@@ -1,218 +0,0 @@
#ifndef RABIT_LEARN_IO_BASE64_INL_H_
#define RABIT_LEARN_IO_BASE64_INL_H_
/*!
 * \file base64-inl.h
 * \brief data stream support to input and output from/to base64 stream
 *  base64 is easier to store and pass as text format in mapreduce
 * \author Tianqi Chen
 */
#include <cctype>
#include <cstdio>
#include "./io.h"
#include "./buffer_reader-inl.h"

namespace rabit {
namespace io {
/*! \brief namespace of base64 decoding and encoding table */
namespace base64 {
const char DecodeTable[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  62,  // '+'
  0, 0, 0,
  63,  // '/'
  52, 53, 54, 55, 56, 57, 58, 59, 60, 61,  // '0'-'9'
  0, 0, 0, 0, 0, 0, 0,
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
  13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  // 'A'-'Z'
  0, 0, 0, 0, 0, 0,
  26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
  39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,  // 'a'-'z'
};
static const char EncodeTable[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
}  // namespace base64
/*! \brief the stream that reads from base64; note we take from file pointers */
class Base64InStream: public Stream {
 public:
  explicit Base64InStream(Stream *fs) : reader_(256) {
    reader_.set_stream(fs);
    num_prev = 0; tmp_ch = 0;
  }
  /*!
   * \brief initialize the stream position to the beginning of the next base64 stream;
   *  call this function before actually starting to read
   */
  inline void InitPosition(void) {
    // get a character
    do {
      tmp_ch = reader_.GetChar();
    } while (isspace(tmp_ch));
  }
  /*! \brief whether current position is end of a base64 stream */
  inline bool IsEOF(void) const {
    return num_prev == 0 && (tmp_ch == EOF || isspace(tmp_ch));
  }
  virtual size_t Read(void *ptr, size_t size) {
    using base64::DecodeTable;
    if (size == 0) return 0;
    // use tlen to record left size
    size_t tlen = size;
    unsigned char *cptr = static_cast<unsigned char*>(ptr);
    // if anything left, load from previous buffered result
    if (num_prev != 0) {
      if (num_prev == 2) {
        if (tlen >= 2) {
          *cptr++ = buf_prev[0];
          *cptr++ = buf_prev[1];
          tlen -= 2;
          num_prev = 0;
        } else {
          // assert tlen == 1
          *cptr++ = buf_prev[0]; --tlen;
          buf_prev[0] = buf_prev[1];
          num_prev = 1;
        }
      } else {
        // assert num_prev == 1
        *cptr++ = buf_prev[0]; --tlen; num_prev = 0;
      }
    }
    if (tlen == 0) return size;
    int nvalue;
    // note: everything goes with 4 bytes in Base64,
    // so we process 4 bytes as a unit
    while (tlen && tmp_ch != EOF && !isspace(tmp_ch)) {
      // first byte
      nvalue = DecodeTable[tmp_ch] << 18;
      {
        // second byte
        utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
                     "invalid base64 format");
        nvalue |= DecodeTable[tmp_ch] << 12;
        *cptr++ = (nvalue >> 16) & 0xFF; --tlen;
      }
      {
        // third byte
        utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
                     "invalid base64 format");
        // handle termination
        if (tmp_ch == '=') {
          utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == '='), "invalid base64 format");
          utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
                       "invalid base64 format");
          break;
        }
        nvalue |= DecodeTable[tmp_ch] << 6;
        if (tlen) {
          *cptr++ = (nvalue >> 8) & 0xFF; --tlen;
        } else {
          buf_prev[num_prev++] = (nvalue >> 8) & 0xFF;
        }
      }
      {
        // fourth byte
        utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
                     "invalid base64 format");
        if (tmp_ch == '=') {
          utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
                       "invalid base64 format");
          break;
        }
        nvalue |= DecodeTable[tmp_ch];
        if (tlen) {
          *cptr++ = nvalue & 0xFF; --tlen;
        } else {
          buf_prev[num_prev++] = nvalue & 0xFF;
        }
      }
      // get next char
      tmp_ch = reader_.GetChar();
    }
    if (kStrictCheck) {
      utils::Check(tlen == 0, "Base64InStream: read incomplete");
    }
    return size - tlen;
  }
  virtual void Write(const void *ptr, size_t size) {
    utils::Error("Base64InStream does not support write");
  }

 private:
  StreamBufferReader reader_;
  int tmp_ch;
  int num_prev;
  unsigned char buf_prev[2];
  // whether we need to do strict check
  static const bool kStrictCheck = false;
};
/*! \brief the stream that writes to base64; note we take from file pointers */
class Base64OutStream: public Stream {
 public:
  explicit Base64OutStream(Stream *fp) : fp(fp) {
    buf_top = 0;
  }
  virtual void Write(const void *ptr, size_t size) {
    using base64::EncodeTable;
    size_t tlen = size;
    const unsigned char *cptr = static_cast<const unsigned char*>(ptr);
    while (tlen) {
      while (buf_top < 3 && tlen != 0) {
        buf[++buf_top] = *cptr++; --tlen;
      }
      if (buf_top == 3) {
        // flush 4 bytes out
        PutChar(EncodeTable[buf[1] >> 2]);
        PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
        PutChar(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F]);
        PutChar(EncodeTable[buf[3] & 0x3F]);
        buf_top = 0;
      }
    }
  }
  virtual size_t Read(void *ptr, size_t size) {
    utils::Error("Base64OutStream does not support read");
    return 0;
  }
  /*!
   * \brief finish writing of all current base64 stream, do some post processing
   * \param endch character to put at the end of the stream; if it is EOF, then nothing will be done
   */
  inline void Finish(char endch = EOF) {
    using base64::EncodeTable;
    if (buf_top == 1) {
      PutChar(EncodeTable[buf[1] >> 2]);
      PutChar(EncodeTable[(buf[1] << 4) & 0x3F]);
      PutChar('=');
      PutChar('=');
    }
    if (buf_top == 2) {
      PutChar(EncodeTable[buf[1] >> 2]);
      PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
      PutChar(EncodeTable[(buf[2] << 2) & 0x3F]);
      PutChar('=');
    }
    buf_top = 0;
    if (endch != EOF) PutChar(endch);
    this->Flush();
  }

 private:
  Stream *fp;
  int buf_top;
  unsigned char buf[4];
  std::string out_buf;
  const static size_t kBufferSize = 256;

  inline void PutChar(char ch) {
    out_buf += ch;
    if (out_buf.length() >= kBufferSize) Flush();
  }
  inline void Flush(void) {
    fp->Write(BeginPtr(out_buf), out_buf.length());
    out_buf.clear();
  }
};
}  // namespace io
}  // namespace rabit
#endif  // RABIT_LEARN_IO_BASE64_INL_H_
@@ -1,58 +0,0 @@
#ifndef RABIT_LEARN_IO_BUFFER_READER_INL_H_
#define RABIT_LEARN_IO_BUFFER_READER_INL_H_
/*!
 * \file buffer_reader-inl.h
 * \brief implementation of stream buffer reader
 * \author Tianqi Chen
 */
#include "./io.h"

namespace rabit {
namespace io {
/*! \brief buffer reader of the stream that allows character-level access to the stream */
class StreamBufferReader {
 public:
  StreamBufferReader(size_t buffer_size)
      : stream_(NULL),
        read_len_(1), read_ptr_(1) {
    buffer_.resize(buffer_size);
  }
  /*!
   * \brief set input stream
   */
  inline void set_stream(Stream *stream) {
    stream_ = stream;
    read_len_ = read_ptr_ = 1;
  }
  /*!
   * \brief allows quick read using get char
   */
  inline char GetChar(void) {
    while (true) {
      if (read_ptr_ < read_len_) {
        return buffer_[read_ptr_++];
      } else {
        read_len_ = stream_->Read(&buffer_[0], buffer_.length());
        if (read_len_ == 0) return EOF;
        read_ptr_ = 0;
      }
    }
  }
  /*! \brief whether we have reached the end of the file */
  inline bool AtEnd(void) const {
    return read_len_ == 0;
  }

 private:
  /*! \brief the underlying stream */
  Stream *stream_;
  /*! \brief buffer to hold data */
  std::string buffer_;
  /*! \brief length of valid data in buffer */
  size_t read_len_;
  /*! \brief pointer in the buffer */
  size_t read_ptr_;
};
}  // namespace io
}  // namespace rabit
#endif  // RABIT_LEARN_IO_BUFFER_READER_INL_H_
@@ -1,112 +0,0 @@
#ifndef RABIT_LEARN_IO_FILE_INL_H_
#define RABIT_LEARN_IO_FILE_INL_H_
/*!
 * \file file-inl.h
 * \brief normal filesystem I/O
 * \author Tianqi Chen
 */
#include <string>
#include <vector>
#include <cstdio>
#include "./io.h"
#include "./line_split-inl.h"

/*! \brief io interface */
namespace rabit {
namespace io {
/*! \brief implementation of file i/o stream */
class FileStream : public utils::SeekStream {
 public:
  explicit FileStream(const char *fname, const char *mode)
      : use_stdio(false) {
    using namespace std;
#ifndef RABIT_STRICT_CXX98_
    if (!strcmp(fname, "stdin")) {
      use_stdio = true; fp = stdin;
    }
    if (!strcmp(fname, "stdout")) {
      use_stdio = true; fp = stdout;
    }
#endif
    if (!strncmp(fname, "file://", 7)) fname += 7;
    if (!use_stdio) {
      std::string flag = mode;
      if (flag == "w") flag = "wb";
      if (flag == "r") flag = "rb";
      fp = utils::FopenCheck(fname, flag.c_str());
    }
  }
  virtual ~FileStream(void) {
    this->Close();
  }
  virtual size_t Read(void *ptr, size_t size) {
    return std::fread(ptr, 1, size, fp);
  }
  virtual void Write(const void *ptr, size_t size) {
    std::fwrite(ptr, size, 1, fp);
  }
  virtual void Seek(size_t pos) {
    std::fseek(fp, static_cast<long>(pos), SEEK_SET);
  }
  virtual size_t Tell(void) {
    return std::ftell(fp);
  }
  virtual bool AtEnd(void) const {
    return std::feof(fp) != 0;
  }
  inline void Close(void) {
    if (fp != NULL && !use_stdio) {
      std::fclose(fp); fp = NULL;
    }
  }

 private:
  std::FILE *fp;
  bool use_stdio;
};

/*! \brief line split from the normal file system */
class FileProvider : public LineSplitter::IFileProvider {
 public:
  explicit FileProvider(const char *uri) {
    LineSplitter::SplitNames(&fnames_, uri, "#");
    std::vector<size_t> fsize;
    for (size_t i = 0; i < fnames_.size(); ++i) {
      if (!std::strncmp(fnames_[i].c_str(), "file://", 7)) {
        std::string tmp = fnames_[i].c_str() + 7;
        fnames_[i] = tmp;
      }
      size_t fz = GetFileSize(fnames_[i].c_str());
      if (fz != 0) {
        fsize_.push_back(fz);
      }
    }
  }
  // destructor
  virtual ~FileProvider(void) {}
  virtual utils::SeekStream *Open(size_t file_index) {
    utils::Assert(file_index < fnames_.size(), "file index exceed bound");
    return new FileStream(fnames_[file_index].c_str(), "rb");
  }
  virtual const std::vector<size_t> &FileSize(void) const {
    return fsize_;
  }

 private:
  // file sizes
  std::vector<size_t> fsize_;
  // file names
  std::vector<std::string> fnames_;
  // get file size
  inline static size_t GetFileSize(const char *fname) {
    std::FILE *fp = utils::FopenCheck(fname, "rb");
    // NOTE: fseek may not be ideal, but serves as an OK solution
    std::fseek(fp, 0, SEEK_END);
    size_t fsize = static_cast<size_t>(std::ftell(fp));
    std::fclose(fp);
    return fsize;
  }
};
}  // namespace io
}  // namespace rabit
#endif  // RABIT_LEARN_IO_FILE_INL_H_

@@ -1,165 +0,0 @@
#ifndef RABIT_LEARN_IO_HDFS_INL_H_
#define RABIT_LEARN_IO_HDFS_INL_H_
/*!
 * \file hdfs-inl.h
 * \brief HDFS I/O
 * \author Tianqi Chen
 */
#include <string>
#include <cstdlib>
#include <vector>
#include <hdfs.h>
#include <errno.h>
#include "./io.h"
#include "./line_split-inl.h"

/*! \brief io interface */
namespace rabit {
namespace io {
class HDFSStream : public SeekStream {
 public:
  HDFSStream(hdfsFS fs,
             const char *fname,
             const char *mode,
             bool disconnect_when_done)
      : fs_(fs), at_end_(false),
        disconnect_when_done_(disconnect_when_done) {
    int flag = 0;
    if (!strcmp(mode, "r")) {
      flag = O_RDONLY;
    } else if (!strcmp(mode, "w")) {
      flag = O_WRONLY;
    } else if (!strcmp(mode, "a")) {
      flag = O_WRONLY | O_APPEND;
    } else {
      utils::Error("HDFSStream: unknown flag %s", mode);
    }
    fp_ = hdfsOpenFile(fs_, fname, flag, 0, 0, 0);
    utils::Check(fp_ != NULL,
                 "HDFSStream: fail to open %s", fname);
  }
  virtual ~HDFSStream(void) {
    this->Close();
    if (disconnect_when_done_) {
      utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
    }
  }
  virtual size_t Read(void *ptr, size_t size) {
    tSize nread = hdfsRead(fs_, fp_, ptr, size);
    if (nread == -1) {
      int errsv = errno;
      utils::Error("HDFSStream.Read Error:%s", strerror(errsv));
    }
    if (nread == 0) {
      at_end_ = true;
    }
    return static_cast<size_t>(nread);
  }
  virtual void Write(const void *ptr, size_t size) {
    const char *buf = reinterpret_cast<const char*>(ptr);
    while (size != 0) {
      tSize nwrite = hdfsWrite(fs_, fp_, buf, size);
      if (nwrite == -1) {
        int errsv = errno;
        utils::Error("HDFSStream.Write Error:%s", strerror(errsv));
      }
      size_t sz = static_cast<size_t>(nwrite);
      buf += sz; size -= sz;
    }
  }
  virtual void Seek(size_t pos) {
    if (hdfsSeek(fs_, fp_, pos) != 0) {
      int errsv = errno;
      utils::Error("HDFSStream.Seek Error:%s", strerror(errsv));
    }
  }
  virtual size_t Tell(void) {
    tOffset offset = hdfsTell(fs_, fp_);
    if (offset == -1) {
      int errsv = errno;
      utils::Error("HDFSStream.Tell Error:%s", strerror(errsv));
    }
    return static_cast<size_t>(offset);
  }
  virtual bool AtEnd(void) const {
    return at_end_;
  }
  inline void Close(void) {
    if (fp_ != NULL) {
      if (hdfsCloseFile(fs_, fp_) == -1) {
        int errsv = errno;
        utils::Error("HDFSStream.Close Error:%s", strerror(errsv));
      }
      fp_ = NULL;
    }
  }

  inline static std::string GetNameNode(void) {
    const char *nn = getenv("rabit_hdfs_namenode");
    if (nn == NULL) {
      return std::string("default");
    } else {
      return std::string(nn);
    }
  }
 private:
  hdfsFS fs_;
  hdfsFile fp_;
  bool at_end_;
  bool disconnect_when_done_;
};

/*! \brief line split from HDFS */
class HDFSProvider : public LineSplitter::IFileProvider {
 public:
  explicit HDFSProvider(const char *uri) {
    fs_ = hdfsConnect(HDFSStream::GetNameNode().c_str(), 0);
    utils::Check(fs_ != NULL, "error when connecting to default HDFS");
    std::vector<std::string> paths;
    LineSplitter::SplitNames(&paths, uri, "#");
    // get the files
    for (size_t i = 0; i < paths.size(); ++i) {
      hdfsFileInfo *info = hdfsGetPathInfo(fs_, paths[i].c_str());
      utils::Check(info != NULL, "path %s does not exist", paths[i].c_str());
      if (info->mKind == 'D') {
        int nentry;
        hdfsFileInfo *files = hdfsListDirectory(fs_, info->mName, &nentry);
        utils::Check(files != NULL, "error when ListDirectory %s", info->mName);
        // renamed loop variable to j; the original shadowed the outer i
        for (int j = 0; j < nentry; ++j) {
          if (files[j].mKind == 'F' && files[j].mSize != 0) {
            fsize_.push_back(files[j].mSize);
            fnames_.push_back(std::string(files[j].mName));
          }
        }
        hdfsFreeFileInfo(files, nentry);
      } else {
        if (info->mSize != 0) {
          fsize_.push_back(info->mSize);
          fnames_.push_back(std::string(info->mName));
        }
      }
      hdfsFreeFileInfo(info, 1);
    }
  }
  virtual ~HDFSProvider(void) {
    utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
  }
  virtual const std::vector<size_t> &FileSize(void) const {
    return fsize_;
  }
  virtual SeekStream *Open(size_t file_index) {
    utils::Assert(file_index < fnames_.size(), "file index exceed bound");
    return new HDFSStream(fs_, fnames_[file_index].c_str(), "r", false);
  }

 private:
  // hdfs handle
  hdfsFS fs_;
  // file sizes
  std::vector<size_t> fsize_;
  // file names
  std::vector<std::string> fnames_;
};
}  // namespace io
}  // namespace rabit
#endif  // RABIT_LEARN_IO_HDFS_INL_H_
@@ -1,99 +0,0 @@
#ifndef RABIT_LEARN_IO_IO_INL_H_
#define RABIT_LEARN_IO_IO_INL_H_
/*!
 * \file io-inl.h
 * \brief Input/Output utils that handle read/write
 *  of files in a distributed environment
 * \author Tianqi Chen
 */
#include <cstring>

#include "./io.h"

#if RABIT_USE_WORMHOLE == 0
#if RABIT_USE_HDFS
#include "./hdfs-inl.h"
#endif
#include "./file-inl.h"
#endif

namespace rabit {
namespace io {
/*!
 * \brief create input split given a uri
 * \param uri the uri of the input, can contain hdfs prefix
 * \param part the part id of current input
 * \param nsplit total number of splits
 */
inline InputSplit *CreateInputSplit(const char *uri,
                                    unsigned part,
                                    unsigned nsplit) {
#if RABIT_USE_WORMHOLE
  return dmlc::InputSplit::Create(uri, part, nsplit);
#else
  using namespace std;
  if (!strcmp(uri, "stdin")) {
    return new SingleFileSplit(uri);
  }
  if (!strncmp(uri, "file://", 7)) {
    return new LineSplitter(new FileProvider(uri), part, nsplit);
  }
  if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
    return new LineSplitter(new HDFSProvider(uri), part, nsplit);
#else
    utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif
  }
  return new LineSplitter(new FileProvider(uri), part, nsplit);
#endif
}

template<typename TStream>
class StreamAdapter : public Stream {
 public:
  explicit StreamAdapter(TStream *stream)
      : stream_(stream) {
  }
  virtual ~StreamAdapter(void) {
    delete stream_;
  }
  virtual size_t Read(void *ptr, size_t size) {
    return stream_->Read(ptr, size);
  }
  virtual void Write(const void *ptr, size_t size) {
    stream_->Write(ptr, size);
  }
 private:
  TStream *stream_;
};

/*!
 * \brief create a stream; the stream must be able to close
 *  the underlying resources (files) when deleted
 *
 * \param uri the uri of the input, can contain hdfs prefix
 * \param mode can be 'w' or 'r' for read or write
 */
inline Stream *CreateStream(const char *uri, const char *mode) {
#if RABIT_USE_WORMHOLE
  return new StreamAdapter<dmlc::Stream>(dmlc::Stream::Create(uri, mode));
#else
  using namespace std;
  if (!strncmp(uri, "file://", 7)) {
    return new FileStream(uri + 7, mode);
  }
  if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
    return new HDFSStream(hdfsConnect(HDFSStream::GetNameNode().c_str(), 0),
                          uri, mode, true);
#else
    utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif
  }
  return new FileStream(uri, mode);
#endif
}
}  // namespace io
}  // namespace rabit
#endif  // RABIT_LEARN_IO_IO_INL_H_
@@ -1,74 +0,0 @@
#ifndef RABIT_LEARN_IO_IO_H_
#define RABIT_LEARN_IO_IO_H_
/*!
 * \file io.h
 * \brief Input/Output utils that handle read/write
 *  of files in a distributed environment
 * \author Tianqi Chen
 */
#include "../../include/rabit_serializable.h"

/*! \brief whether compiled with HDFS support */
#ifndef RABIT_USE_HDFS
#define RABIT_USE_HDFS 0
#endif

#ifndef RABIT_USE_WORMHOLE
#define RABIT_USE_WORMHOLE 0
#endif

#if RABIT_USE_WORMHOLE
#include <dmlc/io.h>
#endif
/*! \brief io interface */
namespace rabit {
/*!
 * \brief namespace to handle input split and filesystem interfacing
 */
namespace io {
/*! \brief reuse SeekStream's definition */
#if RABIT_USE_WORMHOLE
typedef dmlc::SeekStream SeekStream;
typedef dmlc::InputSplit InputSplit;
#else
typedef utils::SeekStream SeekStream;
/*!
 * \brief user facing input split helper;
 *  can be used to get the partition of data used by the current node
 */
class InputSplit {
 public:
  /*!
   * \brief get next line, store into out_data
   * \param out_data the string that stores the line data;
   *  \n is not included
   * \return true if the next line was found, false if we have read all the lines
   */
  virtual bool ReadLine(std::string *out_data) = 0;
  /*! \brief destructor */
  virtual ~InputSplit(void) {}
};
#endif
/*!
 * \brief create input split given a uri
 * \param uri the uri of the input, can contain hdfs prefix
 * \param part the part id of current input
 * \param nsplit total number of splits
 */
inline InputSplit *CreateInputSplit(const char *uri,
                                    unsigned part,
                                    unsigned nsplit);
/*!
 * \brief create a stream; the stream must be able to close
 *  the underlying resources (files) when deleted
 *
 * \param uri the uri of the input, can contain hdfs prefix
 * \param mode can be 'w' or 'r' for read or write
 */
inline Stream *CreateStream(const char *uri, const char *mode);
}  // namespace io
}  // namespace rabit

#include "./io-inl.h"
#include "./base64-inl.h"
#endif  // RABIT_LEARN_IO_IO_H_
@@ -1,206 +0,0 @@
#ifndef RABIT_LEARN_IO_LINE_SPLIT_INL_H_
#define RABIT_LEARN_IO_LINE_SPLIT_INL_H_
/*!
 * \file line_split-inl.h
 * \brief base implementation of the line splitter
 * \author Tianqi Chen
 */
#include <vector>
#include <utility>
#include <cstring>
#include <string>
#include <algorithm>  // std::min, std::upper_bound (added; needed by Init below)
#include "../../include/rabit.h"
#include "./io.h"
#include "./buffer_reader-inl.h"

namespace rabit {
namespace io {

/*! \brief class that splits the files by line */
class LineSplitter : public InputSplit {
 public:
  class IFileProvider {
   public:
    /*!
     * \brief get the seek stream of given file_index
     * \return the corresponding seek stream at the head of the stream;
     *  the seek stream's resources can be freed by calling delete
     */
    virtual SeekStream *Open(size_t file_index) = 0;
    /*!
     * \return const reference to the size of each file
     */
    virtual const std::vector<size_t> &FileSize(void) const = 0;
    // virtual destructor
    virtual ~IFileProvider() {}
  };
  // constructor
  explicit LineSplitter(IFileProvider *provider,
                        unsigned rank,
                        unsigned nsplit)
      : provider_(provider), fs_(NULL),
        reader_(kBufferSize) {
    this->Init(provider_->FileSize(), rank, nsplit);
  }
  // destructor
  virtual ~LineSplitter() {
    if (fs_ != NULL) {
      delete fs_; fs_ = NULL;
    }
    // delete provider after destructing the streams
    delete provider_;
  }
  // get next line
  virtual bool ReadLine(std::string *out_data) {
    if (file_ptr_ >= file_ptr_end_ &&
        offset_curr_ >= offset_end_) return false;
    out_data->clear();
    while (true) {
      char c = reader_.GetChar();
      if (reader_.AtEnd()) {
        if (out_data->length() != 0) return true;
        file_ptr_ += 1;
        if (offset_curr_ >= offset_end_) return false;
        if (offset_curr_ != file_offset_[file_ptr_]) {
          utils::Error("warning: file size not calculated correctly\n");
          offset_curr_ = file_offset_[file_ptr_];
        }
        utils::Assert(file_ptr_ + 1 < file_offset_.size(),
                      "boundary check");
        delete fs_;
        fs_ = provider_->Open(file_ptr_);
        reader_.set_stream(fs_);
      } else {
        ++offset_curr_;
        if (c != '\r' && c != '\n' && c != EOF) {
          *out_data += c;
        } else {
          if (out_data->length() != 0) return true;
          if (file_ptr_ >= file_ptr_end_ &&
              offset_curr_ >= offset_end_) return false;
        }
      }
    }
  }
  /*!
   * \brief split the given names
   * \param out_fname output file names
   * \param uri_ the input uri
   * \param dlm delimiter
   */
  inline static void SplitNames(std::vector<std::string> *out_fname,
                                const char *uri_,
                                const char *dlm) {
    std::string uri = uri_;
    char *p = std::strtok(BeginPtr(uri), dlm);
    while (p != NULL) {
      out_fname->push_back(std::string(p));
      p = std::strtok(NULL, dlm);
    }
  }

 private:
  /*!
   * \brief initialize the line splitter
   * \param file_size size of each file
   * \param rank the current rank of the data
   * \param nsplit number of splits we will divide the data into
   */
  inline void Init(const std::vector<size_t> &file_size,
                   unsigned rank, unsigned nsplit) {
    file_offset_.resize(file_size.size() + 1);
    file_offset_[0] = 0;
    for (size_t i = 0; i < file_size.size(); ++i) {
      file_offset_[i + 1] = file_offset_[i] + file_size[i];
    }
    size_t ntotal = file_offset_.back();
    size_t nstep = (ntotal + nsplit - 1) / nsplit;
    offset_begin_ = std::min(nstep * rank, ntotal);
    offset_end_ = std::min(nstep * (rank + 1), ntotal);
    offset_curr_ = offset_begin_;
    if (offset_begin_ == offset_end_) return;
    file_ptr_ = std::upper_bound(file_offset_.begin(),
                                 file_offset_.end(),
                                 offset_begin_) - file_offset_.begin() - 1;
    file_ptr_end_ = std::upper_bound(file_offset_.begin(),
                                     file_offset_.end(),
                                     offset_end_) - file_offset_.begin() - 1;
    fs_ = provider_->Open(file_ptr_);
    reader_.set_stream(fs_);
    // try to set the starting position correctly
    if (file_offset_[file_ptr_] != offset_begin_) {
      fs_->Seek(offset_begin_ - file_offset_[file_ptr_]);
      while (true) {
        char c = reader_.GetChar();
        if (!reader_.AtEnd()) ++offset_curr_;
        if (c == '\n' || c == '\r' || c == EOF) return;
      }
    }
  }

 private:
  /*! \brief FileProvider */
  IFileProvider *provider_;
  /*! \brief current input stream */
  utils::SeekStream *fs_;
  /*! \brief file pointer of which file to read on */
  size_t file_ptr_;
  /*! \brief file pointer where the end of file lies */
  size_t file_ptr_end_;
  /*! \brief the current offset */
  size_t offset_curr_;
  /*! \brief beginning of offset */
  size_t offset_begin_;
  /*! \brief end of the offset */
  size_t offset_end_;
  /*! \brief byte-offset of each file */
  std::vector<size_t> file_offset_;
  /*! \brief buffer reader */
  StreamBufferReader reader_;
  /*! \brief buffer size */
  const static size_t kBufferSize = 256;
};

/*! \brief line split from a single file */
class SingleFileSplit : public InputSplit {
 public:
  explicit SingleFileSplit(const char *fname)
      : use_stdin_(false) {  // explicitly initialized; it was left uninitialized upstream
    if (!std::strcmp(fname, "stdin")) {
#ifndef RABIT_STRICT_CXX98_
      use_stdin_ = true; fp_ = stdin;
#endif
    }
    if (!use_stdin_) {
      fp_ = utils::FopenCheck(fname, "r");
    }
    end_of_file_ = false;
  }
  virtual ~SingleFileSplit(void) {
    if (!use_stdin_) std::fclose(fp_);
  }
  virtual bool ReadLine(std::string *out_data) {
    if (end_of_file_) return false;
    out_data->clear();
    while (true) {
      char c = std::fgetc(fp_);
      if (c == EOF) {
        end_of_file_ = true;
      }
      if (c != '\r' && c != '\n' && c != EOF) {
        *out_data += c;
      } else {
        if (out_data->length() != 0) return true;
        if (end_of_file_) return false;
      }
    }
    return false;
  }

 private:
  std::FILE *fp_;
  bool use_stdin_;
  bool end_of_file_;
};
}  // namespace io
}  // namespace rabit
#endif  // RABIT_LEARN_IO_LINE_SPLIT_INL_H_
subtree/rabit/rabit-learn/kmeans/.gitignore (vendored, 2 lines deleted)
@@ -1,2 +0,0 @@
kmeans
*.mpi
@@ -1,15 +0,0 @@
# specify tensor path
BIN = kmeans.rabit
MOCKBIN= kmeans.mock
MPIBIN = kmeans.mpi
# objectives that make up the rabit library
OBJ = kmeans.o

# common build script for programs
include ../make/common.mk

# dependencies here
kmeans.rabit: kmeans.o lib
kmeans.mock: kmeans.o lib
kmeans.mpi: kmeans.o libmpi
kmeans.o: kmeans.cc ../../src/*.h
@@ -1,129 +0,0 @@
Toolkit
====
This folder contains some example toolkits developed with rabit to help you get started.

KMeans
====

## Input File Format
KMeans uses the LIBSVM format to parse the input. If you are not familiar with LIBSVM, you will find more details <a href="http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/">here</a>.

The format is the following:

<label> <index1>:<value1> <index2>:<value2> ...

where label is a dummy integer value in this case (you can add 1's to every example), index<x> is the index of feature x, and value<x> is the value of feature x.
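For example, a valid input line with two non-zero features could look like the following (the indices and values here are made up purely for illustration):

1 3:0.26 12:1.95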

## Output File Format
KMeans currently outputs the centroids as dense vectors. Each line in the output file corresponds to a centroid. The number of lines in the file must match the number of clusters K you specified on the command line.

## Example

Let's go over a more detailed example.

#### Preprocess

Download the smallwiki dataset used in the Machine Learning for Big Data class at the University of Washington:

http://courses.cs.washington.edu/courses/cse547/14wi/datasets/smallwiki.zip

Unzip it; you should find three files:
* tfidf.txt: each row is in the form of "docid||termid1:tfidf1,termid2:tfidf2,..."
* dictionary.txt: map of term to termid
* cluster0.txt: initial cluster centers. Won't be needed.

The first thing to do is to convert the tfidf file format into the input format rabit supports, i.e. LIBSVM. For that, you can use a simple python script; the following should suffice. You should redirect the output to a file, let's say tfidf.libsvm.

```python
# rows look like "docid||termid1:tfidf1,termid2:tfidf2,..."
for line in open("tfidf.txt").read().splitlines():
    example = line.split('||')[1].split(',')
    example = ' '.join(example)
    print '%s %s' % (1, example)
```
#### Compile

You will then need to build the KMeans program with ```make```, which will produce three binaries:

* kmeans.mpi: runs on MPI.
* kmeans.mock: uses a mock to simulate error conditions for testing purposes.
* kmeans.rabit: uses our C++ implementation.
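As a sketch, assuming you build from the kmeans folder with the toolchain configured in ../make/common.mk, the build step is simply:

```bash
cd rabit-learn/kmeans
make   # produces the kmeans.rabit, kmeans.mock and kmeans.mpi binaries
```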

#### Running with Hadoop

If you want to run it with Hadoop, you can execute the [./kmeans_hadoop.sh](./kmeans_hadoop.sh) script from the master node in your cluster.
You will have to edit the file in order to specify the path to the Hadoop Streaming jar. Afterwards, you can execute it with the following arguments (in the exact same order; see the example after this list):

* number of worker nodes in your Hadoop cluster (i.e. number of slave nodes)
* path to the input data (HDFS path where you put the preprocessed file in libsvm format)
* number of clusters K (let's use 20 for this example)
* number of iterations to perform (let's use just 5 iterations)
* output path (HDFS path where to store the output data; must be a non-existent folder)
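Putting these together, a sketch of the invocation (using the HDFS paths created in the Hadoop commands shown below, and the argument order from the usage string inside kmeans_hadoop.sh) would be:

```bash
./kmeans_hadoop.sh 4 kmeans/in/tfidf.libsvm 20 5 kmeans/out
```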

The current implementation runs for the number of iterations you specify in the command line arguments. If you would like to add some convergence criterion (e.g. stop when no cluster assignment changes between iterations) you will have to modify [./kmeans.cc](./kmeans.cc). We leave that as an exercise to the reader :)

You may have noticed that [./kmeans_hadoop.sh](./kmeans_hadoop.sh) uses the kmeans.rabit binary, but you can also use kmeans.mock in order to easily test your system's behavior in the presence of failures. More on that later.

Don't forget to copy the preprocessed file into HDFS and create the output folder. For example, inside the bin folder in Hadoop, you can execute the following:

```bash
$ ./hadoop fs -mkdir kmeans
$ ./hadoop fs -mkdir kmeans/in
$ ./hadoop fs -put tfidf.libsvm kmeans/in
$ ./hadoop fs -mkdir kmeans/out
```

#### Running with MPI

You will need to have an MPI cluster installed, for example OpenMPI. In order to run the program, you can use mpirun to submit the job. This is a non-fault-tolerant version, as it is backed by MPI.
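As a sketch, assuming OpenMPI's mpirun and the argument order printed by kmeans.cc (`<data_dir> num_cluster max_iter <out_model>`), a four-process run could look like:

```bash
mpirun -n 4 ./kmeans.mpi tfidf.libsvm 20 5 kmeans.model
```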

#### Running with Mock

As previously mentioned, you can execute the kmeans example, and any of your own, with the mock binary. This will allow you to test error conditions while you are developing your algorithms. As explained in the [Tutorial](../guide), passing the script certain parameters (e.g. mock=0,0,1,0) will cause a certain node to exit after calling Allreduce/Broadcast in some iteration.
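For instance, reusing the argument order above and the mock=0,0,1,0 example from the text (the exact command line here is illustrative, not taken from the original guide):

```bash
./kmeans.mock tfidf.libsvm 20 5 kmeans.model mock=0,0,1,0
```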

You can also run this locally; you will only need to split the input file into several smaller files, each of which will be used by a particular process in the shared-memory environment. You can use a Unix command line tool such as split.
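For example (the chunk size here is arbitrary):

```bash
# split the preprocessed input into pieces of 100000 lines each
split -l 100000 tfidf.libsvm tfidf.part.
```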

#### Processing Output

Once the program finishes running, you can fetch the output from HDFS. For example, inside the bin folder in Hadoop, you can execute the following:

```bash
$ ./hadoop fs -get kmeans/out/part-00000 kmeans.out
```

Each line of the output file is a centroid in dense format. As this dataset ships with the dictionary.txt file mapping words to term ids, you can do some simple post-processing to recover the top 10 words of each centroid. Something like this should work:

```python
words = {}
for line in open("dictionary.txt").read().splitlines():
    word, index = line.split(' ')
    words[int(index)] = word

from collections import defaultdict
clusters = defaultdict(list)
cluster_name = 0
for line in open("kmeans.out").read().splitlines():
    line = line.split(' ')
    clusters[cluster_name].extend(line)
    cluster_name += 1

import numpy as np
for j, key in enumerate(clusters):
    elements = clusters[key]
    array = np.array(elements).astype(np.float32)
    idx = np.argsort(array)[::-1][:10]
    ws = []
    for i in idx:
        ws.append(words[i])
    print 'cluster %d = %s' % (j, ' '.join(ws))
```

@@ -1,197 +0,0 @@
// this is a test case to test whether rabit can recover model when
// facing an exception
#include <rabit.h>
#include <rabit/utils.h>
#include <time.h>
#include <cmath>    // sqrt (added; the diff's include list was incomplete)
#include <sstream>  // std::ostringstream (added for the same reason)
#include "../utils/data.h"

using namespace rabit;

// a simple dense matrix; mshadow or an Eigen matrix would be better,
// but this one is OK here
struct Matrix {
  inline void Init(size_t nrow, size_t ncol, float v = 0.0f) {
    this->nrow = nrow;
    this->ncol = ncol;
    data.resize(nrow * ncol);
    std::fill(data.begin(), data.end(), v);
  }
  inline float *operator[](size_t i) {
    return &data[0] + i * ncol;
  }
  inline const float *operator[](size_t i) const {
    return &data[0] + i * ncol;
  }
  // print the matrix into the given file
  // (reconstructed: the diff showed a Stream-based signature with the
  // string buffer declared inside the loop, which did not match the
  // call site in main below)
  inline void Print(const char *fname) {
    std::FILE *fo = utils::FopenCheck(fname, "w");
    std::ostringstream ss;
    for (size_t i = 0; i < data.size(); ++i) {
      ss << data[i];
      if ((i + 1) % ncol == 0) {
        ss << '\n';
      } else {
        ss << ' ';
      }
    }
    std::string s = ss.str();
    std::fwrite(s.c_str(), 1, s.length(), fo);
    std::fclose(fo);
  }
  // number of data
  size_t nrow, ncol;
  std::vector<float> data;
};

// kmeans model
class Model : public rabit::Serializable {
 public:
  // matrix of centroids
  Matrix centroids;
  // load from stream
  virtual void Load(rabit::Stream *fi) {
    fi->Read(&centroids.nrow, sizeof(centroids.nrow));
    fi->Read(&centroids.ncol, sizeof(centroids.ncol));
    fi->Read(&centroids.data);
  }
  /*! \brief save the model to the stream */
  virtual void Save(rabit::Stream *fo) const {
    fo->Write(&centroids.nrow, sizeof(centroids.nrow));
    fo->Write(&centroids.ncol, sizeof(centroids.ncol));
    fo->Write(centroids.data);
  }
  virtual void InitModel(unsigned num_cluster, unsigned feat_dim) {
    centroids.Init(num_cluster, feat_dim);
  }
  // normalize L2 norm
  inline void Normalize(void) {
    for (size_t i = 0; i < centroids.nrow; ++i) {
      float *row = centroids[i];
      double wsum = 0.0;
      for (size_t j = 0; j < centroids.ncol; ++j) {
        wsum += row[j] * row[j];
      }
      wsum = sqrt(wsum);
      if (wsum < 1e-6) return;
      float winv = 1.0 / wsum;
      for (size_t j = 0; j < centroids.ncol; ++j) {
        row[j] *= winv;
      }
    }
  }
};
inline void InitCentroids(const SparseMat &data, Matrix *centroids) {
  int num_cluster = centroids->nrow;
  for (int i = 0; i < num_cluster; ++i) {
    int index = Random(data.NumRow());
    SparseMat::Vector v = data[index];
    for (unsigned j = 0; j < v.length; ++j) {
      (*centroids)[i][v[j].findex] = v[j].fvalue;
    }
  }
  for (int i = 0; i < num_cluster; ++i) {
    int proc = Random(rabit::GetWorldSize());
    rabit::Broadcast((*centroids)[i], centroids->ncol * sizeof(float), proc);
  }
}

inline double Cos(const float *row,
                  const SparseMat::Vector &v) {
  double rdot = 0.0, rnorm = 0.0;
  for (unsigned i = 0; i < v.length; ++i) {
    rdot += row[v[i].findex] * v[i].fvalue;
    rnorm += v[i].fvalue * v[i].fvalue;
  }
  return rdot / sqrt(rnorm);
}
inline size_t GetCluster(const Matrix &centroids,
                         const SparseMat::Vector &v) {
  size_t imin = 0;
  double dmin = Cos(centroids[0], v);
  for (size_t k = 1; k < centroids.nrow; ++k) {
    double dist = Cos(centroids[k], v);
    if (dist > dmin) {
      dmin = dist; imin = k;
    }
  }
  return imin;
}

int main(int argc, char *argv[]) {
  if (argc < 5) {
    // initialize rabit engine
    rabit::Init(argc, argv);
    if (rabit::GetRank() == 0) {
      rabit::TrackerPrintf("Usage: <data_dir> num_cluster max_iter <out_model>\n");
    }
    rabit::Finalize();
    return 0;
  }
  clock_t tStart = clock();

  srand(0);
  // load the data
  SparseMat data;
  data.Load(argv[1]);
  // set the parameters
  int num_cluster = atoi(argv[2]);
  int max_iter = atoi(argv[3]);
  // initialize rabit engine
  rabit::Init(argc, argv);
  // load model
  Model model;
  int iter = rabit::LoadCheckPoint(&model);
  if (iter == 0) {
    rabit::Allreduce<op::Max>(&data.feat_dim, 1);
    model.InitModel(num_cluster, data.feat_dim);
    InitCentroids(data, &model.centroids);
    model.Normalize();
    rabit::TrackerPrintf("[%d] start at %s\n",
                         rabit::GetRank(), rabit::GetProcessorName().c_str());
  } else {
    rabit::TrackerPrintf("[%d] restart iter=%d\n", rabit::GetRank(), iter);
  }
  const unsigned num_feat = data.feat_dim;
  // matrix to store the result
  Matrix temp;
  for (int r = iter; r < max_iter; ++r) {
    temp.Init(num_cluster, num_feat + 1, 0.0f);
#if __cplusplus >= 201103L
    auto lazy_get_centroid = [&]()
#endif
    {
      // lambda function used to calculate the data if necessary;
      // this function may not be called when the result can be directly recovered
      const size_t ndata = data.NumRow();
      for (size_t i = 0; i < ndata; ++i) {
        SparseMat::Vector v = data[i];
        size_t k = GetCluster(model.centroids, v);
        // temp[k] += v
        for (size_t j = 0; j < v.length; ++j) {
          temp[k][v[j].findex] += v[j].fvalue;
        }
        // use the last column to record counts
        temp[k][num_feat] += 1.0f;
      }
    };
    // call allreduce
#if __cplusplus >= 201103L
    rabit::Allreduce<op::Sum>(&temp.data[0], temp.data.size(), lazy_get_centroid);
#else
    rabit::Allreduce<op::Sum>(&temp.data[0], temp.data.size());
#endif
    // set number
    for (int k = 0; k < num_cluster; ++k) {
      float cnt = temp[k][num_feat];
      utils::Check(cnt != 0.0f, "get zero sized cluster");
      for (unsigned i = 0; i < num_feat; ++i) {
        model.centroids[k][i] = temp[k][i] / cnt;
      }
    }
    model.Normalize();
    rabit::LazyCheckPoint(&model);
  }
  // output the model file to somewhere
  if (rabit::GetRank() == 0) {
    model.centroids.Print(argv[4]);
  }
  rabit::TrackerPrintf("[%d] Time taken: %f seconds\n", rabit::GetRank(), static_cast<float>(clock() - tStart) / CLOCKS_PER_SEC);
  rabit::Finalize();
  return 0;
}
@ -1,9 +0,0 @@
#!/bin/bash
if [ "$#" -lt 5 ];
then
    echo "Usage: <nslaves> <input_data> <ncluster> <max_iteration> <output>"
    exit -1
fi
# set the path to the hadoop streaming jar here
STREAMING_JAR=
python ../tracker/rabit_hadoop.py -hs $STREAMING_JAR -n $1 -i $2 -o $5 kmeans.rabit stdin $3 $4 stdout
subtree/rabit/rabit-learn/linear/.gitignore
@ -1,2 +0,0 @@
mushroom.row*
*.model
@ -1,21 +0,0 @@
ifneq ("$(wildcard ../config.mk)","")
config = ../config.mk
else
config = ../make/config.mk
endif
include $(config)

BIN = linear.rabit
MOCKBIN= linear.mock
MPIBIN =
# object files that make up this program
OBJ = linear.o

# common build script for the programs
include ../make/common.mk
CFLAGS+=-fopenmp
linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
# dependencies here
linear.rabit: linear.o lib
linear.mock: linear.o lib
@ -1,48 +0,0 @@
Linear and Logistic Regression
====
* Input format: LibSVM
* Local example: [run-linear.sh](run-linear.sh)
* Running on YARN: [run-yarn.sh](run-yarn.sh)
  - You will need a working YARN cluster
  - Modify ```../make/config.mk``` and set USE_HDFS=1 to compile with HDFS support
  - Run build.sh in [../../yarn](../../yarn) to build the YARN jar file

Multi-Threading Optimization
====
* The code can be multi-threaded; we encourage you to use it
  - Simply add ```nthread=k```, where k is the number of threads you want to use
* If you submit with YARN
  - Use ```--vcores``` and ```-mem``` to request CPU and memory resources
  - Some YARN schedulers do not honor CPU requests; you can request more memory to grab working slots
* Multi-threading usually improves speed
  - You can use fewer workers and assign more resources to each worker
  - This usually means less communication overhead and a faster running time

Parameters
====
All the parameters can be set as ```param=value``` pairs on the command line; see the example after the parameter list.

#### Important Parameters
* objective [default = logistic]
  - can be linear or logistic
* base_score [default = 0.5]
  - global bias, recommended to be set to the mean of the labels
* reg_L1 [default = 0]
  - L1 regularization coefficient
* reg_L2 [default = 1]
  - L2 regularization coefficient
* lbfgs_stop_tol [default = 1e-5]
  - relative tolerance of the loss reduction with respect to the initial loss
* max_lbfgs_iter [default = 500]
  - maximum number of L-BFGS iterations

#### Optimization Related Parameters
* min_lbfgs_iter [default = 5]
  - minimum number of L-BFGS iterations
* max_linesearch_iter [default = 100]
  - maximum number of iterations in the line search
* linesearch_c1 [default = 1e-4]
  - c1 coefficient in the backoff line search
* linesearch_backoff [default = 0.5]
  - backoff ratio in the line search
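For example, the following commands (adapted from [run-linear.sh](run-linear.sh); the worker count and parameter values are illustrative) train locally on four workers with L1 regularization and four threads per worker, then predict with the saved model:

```
../../tracker/rabit_demo.py -n 4 linear.rabit ../data/agaricus.txt.train reg_L1=1 nthread=4
./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model
```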
@ -1,227 +0,0 @@
#include "./linear.h"
#include "../io/io.h"

namespace rabit {
namespace linear {
class LinearObjFunction : public solver::IObjFunction<float> {
 public:
  // number of training threads
  int nthread;
  // L2 regularization coefficient
  float reg_L2;
  // model
  LinearModel model;
  // training data
  SparseMat dtrain;
  // solver
  solver::LBFGSSolver<float> lbfgs;
  // constructor
  LinearObjFunction(void) {
    lbfgs.SetObjFunction(this);
    nthread = 1;
    reg_L2 = 0.0f;
    model.weight = NULL;
    task = "train";
    model_in = "NULL";
    name_pred = "pred.txt";
    model_out = "final.model";
  }
  virtual ~LinearObjFunction(void) {
  }
  // set parameters
  inline void SetParam(const char *name, const char *val) {
    model.param.SetParam(name, val);
    lbfgs.SetParam(name, val);
    if (!strcmp(name, "num_feature")) {
      char ndigit[30];
      sprintf(ndigit, "%lu", model.param.num_feature + 1);
      lbfgs.SetParam("num_dim", ndigit);
    }
    if (!strcmp(name, "reg_L2")) {
      reg_L2 = static_cast<float>(atof(val));
    }
    if (!strcmp(name, "nthread")) {
      nthread = atoi(val);
    }
    if (!strcmp(name, "task")) task = val;
    if (!strcmp(name, "model_in")) model_in = val;
    if (!strcmp(name, "model_out")) model_out = val;
    if (!strcmp(name, "name_pred")) name_pred = val;
  }
  inline void Run(void) {
    if (model_in != "NULL") {
      this->LoadModel(model_in.c_str());
    }
    if (task == "train") {
      lbfgs.Run();
      if (rabit::GetRank() == 0) {
        this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
      }
    } else if (task == "pred") {
      this->TaskPred();
    } else {
      utils::Error("unknown task=%s", task.c_str());
    }
  }
  inline void TaskPred(void) {
    utils::Check(model_in != "NULL",
                 "must set model_in for task=pred");
    FILE *fp = utils::FopenCheck(name_pred.c_str(), "w");
    for (size_t i = 0; i < dtrain.NumRow(); ++i) {
      float pred = model.Predict(dtrain[i]);
      fprintf(fp, "%g\n", pred);
    }
    fclose(fp);
    printf("Finished writing predictions to %s\n", name_pred.c_str());
  }
  inline void LoadModel(const char *fname) {
    Stream *fi = io::CreateStream(fname, "r");
    std::string header; header.resize(4);
    // check the header for the binary encoding,
    // which can be base64 or raw binary
    utils::Check(fi->Read(&header[0], 4) != 0, "invalid model");
    // base64 format
    if (header == "bs64") {
      io::Base64InStream bsin(fi);
      bsin.InitPosition();
      model.Load(&bsin);
    } else if (header == "binf") {
      model.Load(fi);
    } else {
      utils::Error("invalid model file");
    }
    delete fi;
  }
  inline void SaveModel(const char *fname,
                        const float *wptr,
                        bool save_base64 = false) {
    Stream *fo = io::CreateStream(fname, "w");
    if (save_base64 || !strcmp(fname, "stdout")) {
      fo->Write("bs64\t", 5);
      io::Base64OutStream bout(fo);
      model.Save(&bout, wptr);
      bout.Finish('\n');
    } else {
      fo->Write("binf", 4);
      model.Save(fo, wptr);
    }
    delete fo;
  }
  inline void LoadData(const char *fname) {
    dtrain.Load(fname);
  }
  virtual size_t InitNumDim(void) {
    if (model_in == "NULL") {
      size_t ndim = dtrain.feat_dim;
      rabit::Allreduce<rabit::op::Max>(&ndim, 1);
      model.param.num_feature = std::max(ndim, model.param.num_feature);
    }
    return model.param.num_feature + 1;
  }
  virtual void InitModel(float *weight, size_t size) {
    if (model_in == "NULL") {
      memset(weight, 0, size * sizeof(float));
      model.param.InitBaseScore();
    } else {
      rabit::Broadcast(model.weight, size * sizeof(float), 0);
      memcpy(weight, model.weight, size * sizeof(float));
    }
  }
  // load model
  virtual void Load(rabit::Stream *fi) {
    fi->Read(&model.param, sizeof(model.param));
  }
  virtual void Save(rabit::Stream *fo) const {
    fo->Write(&model.param, sizeof(model.param));
  }
  virtual double Eval(const float *weight, size_t size) {
    if (nthread != 0) omp_set_num_threads(nthread);
    utils::Check(size == model.param.num_feature + 1,
                 "size consistency check");
    double sum_val = 0.0;
    #pragma omp parallel for schedule(static) reduction(+:sum_val)
    for (size_t i = 0; i < dtrain.NumRow(); ++i) {
      float py = model.param.PredictMargin(weight, dtrain[i]);
      float fv = model.param.MarginToLoss(dtrain.labels[i], py);
      sum_val += fv;
    }
    if (rabit::GetRank() == 0) {
      // only add regularization once
      if (reg_L2 != 0.0f) {
        double sum_sqr = 0.0;
        for (size_t i = 0; i < model.param.num_feature; ++i) {
          sum_sqr += weight[i] * weight[i];
        }
        sum_val += 0.5 * reg_L2 * sum_sqr;
      }
    }
    utils::Check(!std::isnan(sum_val), "nan occurs");
    return sum_val;
  }
  virtual void CalcGrad(float *out_grad,
                        const float *weight,
                        size_t size) {
    if (nthread != 0) omp_set_num_threads(nthread);
    utils::Check(size == model.param.num_feature + 1,
                 "size consistency check");
    memset(out_grad, 0, sizeof(float) * size);
    double sum_gbias = 0.0;
    #pragma omp parallel for schedule(static) reduction(+:sum_gbias)
    for (size_t i = 0; i < dtrain.NumRow(); ++i) {
      SparseMat::Vector v = dtrain[i];
      float py = model.param.Predict(weight, v);
      float grad = model.param.PredToGrad(dtrain.labels[i], py);
      for (index_t j = 0; j < v.length; ++j) {
        out_grad[v[j].findex] += v[j].fvalue * grad;
      }
      sum_gbias += grad;
    }
    out_grad[model.param.num_feature] = static_cast<float>(sum_gbias);
    if (rabit::GetRank() == 0) {
      // only add regularization once
      if (reg_L2 != 0.0f) {
        for (size_t i = 0; i < model.param.num_feature; ++i) {
          out_grad[i] += reg_L2 * weight[i];
        }
      }
    }
  }

 private:
  std::string task;
  std::string model_in;
  std::string model_out;
  std::string name_pred;
};
}  // namespace linear
}  // namespace rabit

int main(int argc, char *argv[]) {
  if (argc < 2) {
    // initialize rabit engine
    rabit::Init(argc, argv);
    if (rabit::GetRank() == 0) {
      rabit::TrackerPrintf("Usage: <data_in> param=val\n");
    }
    rabit::Finalize();
    return 0;
  }
  rabit::linear::LinearObjFunction *linear = new rabit::linear::LinearObjFunction();
  if (!strcmp(argv[1], "stdin")) {
    // when reading from stdin, load the data before initializing the engine
    linear->LoadData(argv[1]);
    rabit::Init(argc, argv);
  } else {
    rabit::Init(argc, argv);
    linear->LoadData(argv[1]);
  }
  for (int i = 2; i < argc; ++i) {
    char name[256], val[256];
    if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
      linear->SetParam(name, val);
    }
  }
  linear->Run();
  delete linear;
  rabit::Finalize();
  return 0;
}
@ -1,134 +0,0 @@
/*!
 * Copyright (c) 2015 by Contributors
 * \file linear.h
 * \brief Linear and Logistic regression
 *
 * \author Tianqi Chen
 */
#ifndef RABIT_LINEAR_H_
#define RABIT_LINEAR_H_
#include <omp.h>
#include "../utils/data.h"
#include "../solver/lbfgs.h"

namespace rabit {
namespace linear {
/*! \brief simple linear model */
struct LinearModel {
  struct ModelParam {
    /*! \brief global bias */
    float base_score;
    /*! \brief number of features */
    size_t num_feature;
    /*! \brief loss type */
    int loss_type;
    // reserved field
    int reserved[16];
    // constructor
    ModelParam(void) {
      memset(this, 0, sizeof(ModelParam));
      base_score = 0.5f;
      num_feature = 0;
      loss_type = 1;
    }
    // initialize base score
    inline void InitBaseScore(void) {
      utils::Check(base_score > 0.0f && base_score < 1.0f,
                   "base_score must be in (0,1) for logistic loss");
      base_score = -std::log(1.0f / base_score - 1.0f);
    }
    /*!
     * \brief set parameters from outside
     * \param name name of the parameter
     * \param val value of the parameter
     */
    inline void SetParam(const char *name, const char *val) {
      using namespace std;
      if (!strcmp("base_score", name)) {
        base_score = static_cast<float>(atof(val));
      }
      if (!strcmp("num_feature", name)) {
        num_feature = static_cast<size_t>(atol(val));
      }
      if (!strcmp("objective", name)) {
        if (!strcmp("linear", val)) {
          loss_type = 0;
        } else if (!strcmp("logistic", val)) {
          loss_type = 1;
        } else {
          utils::Error("unknown objective type %s\n", val);
        }
      }
    }
    // transform margin to prediction
    inline float MarginToPred(float margin) const {
      if (loss_type == 1) {
        return 1.0f / (1.0f + std::exp(-margin));
      } else {
        return margin;
      }
    }
    // margin to loss, using a numerically stable form of the logistic loss
    inline float MarginToLoss(float label, float margin) const {
      if (loss_type == 1) {
        float nlogprob;
        if (margin > 0.0f) {
          nlogprob = std::log(1.0f + std::exp(-margin));
        } else {
          nlogprob = -margin + std::log(1.0f + std::exp(margin));
        }
        return label * nlogprob +
            (1.0f - label) * (margin + nlogprob);
      } else {
        float diff = margin - label;
        return 0.5f * diff * diff;
      }
    }
    inline float PredToGrad(float label, float pred) const {
      return pred - label;
    }
    inline float PredictMargin(const float *weight,
                               const SparseMat::Vector &v) const {
      // weight[num_feature] is the bias
      float sum = base_score + weight[num_feature];
      for (unsigned i = 0; i < v.length; ++i) {
        if (v[i].findex >= num_feature) continue;
        sum += weight[v[i].findex] * v[i].fvalue;
      }
      return sum;
    }
    inline float Predict(const float *weight,
                         const SparseMat::Vector &v) const {
      return MarginToPred(PredictMargin(weight, v));
    }
  };
  // model parameter
  ModelParam param;
  // weight corresponding to the model
  float *weight;
  LinearModel(void) : weight(NULL) {
  }
  ~LinearModel(void) {
    if (weight != NULL) delete [] weight;
  }
  // load model
  inline void Load(rabit::Stream *fi) {
    fi->Read(&param, sizeof(param));
    if (weight == NULL) {
      weight = new float[param.num_feature + 1];
    }
    fi->Read(weight, sizeof(float) * (param.num_feature + 1));
  }
  inline void Save(rabit::Stream *fo, const float *wptr = NULL) {
    fo->Write(&param, sizeof(param));
    if (wptr == NULL) wptr = weight;
    fo->Write(wptr, sizeof(float) * (param.num_feature + 1));
  }
  inline float Predict(const SparseMat::Vector &v) const {
    return param.Predict(weight, v);
  }
};
}  // namespace linear
}  // namespace rabit
#endif  // RABIT_LINEAR_H_
@ -1,20 +0,0 @@
#!/bin/bash
if [ "$#" -lt 3 ];
then
    echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
    exit -1
fi

# put the local training file to HDFS
hadoop fs -rm -r -f $2/data
hadoop fs -rm -r -f $2/mushroom.linear.model
hadoop fs -mkdir $2/data
hadoop fs -put ../data/agaricus.txt.train $2/data

# submit to hadoop
../../tracker/rabit_hadoop_streaming.py -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"

# get the final model file
hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model

./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model
@ -1,11 +0,0 @@
#!/bin/bash
if [[ $# -lt 1 ]]
then
    echo "Usage: nprocess"
    exit -1
fi

rm -rf *.model
k=$1

../../tracker/rabit_demo.py -n $k linear.mock ../data/agaricus.txt.train "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1
@ -1,14 +0,0 @@
#!/bin/bash
if [[ $# -lt 1 ]]
then
    echo "Usage: nprocess"
    exit -1
fi

rm -rf *.model
k=$1

# run the linear model; the program automatically splits the input among workers
../../tracker/rabit_demo.py -n $k linear.rabit ../data/agaricus.txt.train reg_L1=1

./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model
@ -1,20 +0,0 @@
#!/bin/bash
if [ "$#" -lt 3 ];
then
    echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
    exit -1
fi

# put the local training file to HDFS
hadoop fs -rm -r -f $2/mushroom.linear.model

hadoop fs -mkdir $2/data
hadoop fs -put ../data/agaricus.txt.train $2/data

# submit to YARN
../../wormhole/tracker/dmlc_yarn.py -n $1 --vcores 1 ./linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}"

# get the final model file
hadoop fs -get $2/mushroom.linear.model ./linear.model

./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model
@ -1,49 +0,0 @@
# this is the common build script for rabit programs
# you do not have to use it
export LDFLAGS= -L../../lib -pthread -lm -lrt
export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fPIC -I../../include

# setup dmlc-core I/O support
ifeq ($(USE_DMLC),1)
include ../../dmlc-core/make/dmlc.mk
CFLAGS+= -DRABIT_USE_DMLC=1 -I ../../dmlc-core/include $(DMLC_CFLAGS)
LDFLAGS+= -L../../dmlc-core -ldmlc $(DMLC_LDFLAGS)
else
CFLAGS+= -DRABIT_USE_DMLC=0
endif

# setup HDFS support
ifeq ($(USE_HDFS),1)
CFLAGS+= -DRABIT_USE_HDFS=1 -I$(HADOOP_HDFS_HOME)/include -I$(JAVA_HOME)/include
LDFLAGS+= -L$(HADOOP_HDFS_HOME)/lib/native -L$(LIBJVM) -lhdfs -ljvm
else
CFLAGS+= -DRABIT_USE_HDFS=0
endif

.PHONY: clean all lib mpi

all: $(BIN) $(MOCKBIN)

mpi: $(MPIBIN)

lib:
	cd ../..; make lib/librabit.a lib/librabit_mock.a; cd -
libmpi:
	cd ../..; make lib/librabit_mpi.a; cd -

$(BIN) :
	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) -lrabit $(LDFLAGS)

$(MOCKBIN) :
	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) -lrabit_mock $(LDFLAGS)

$(OBJ) :
	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )

$(MPIBIN) :
	$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) -lrabit_mpi

clean:
	$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MOCKBIN) *~ ../src/*~
@ -1,24 +0,0 @@
#-----------------------------------------------------
# rabit-learn: the compile configuration script
#
# This is the default configuration setup for rabit-learn.
# If you want to change the configuration, do the following steps:
#
# - copy this file to the root of the rabit-learn folder
# - modify the configuration you want
# - type make or make -j n in each folder
#----------------------------------------------------

# choice of compiler
export CC = gcc
export CXX = g++
export MPICXX = mpicxx

# whether to use HDFS support during compile
USE_HDFS = 1

# whether to use dmlc's io utils
USE_DMLC = 0

# path to libjvm.so
LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
@ -1,669 +0,0 @@
/*!
 * Copyright (c) 2015 by Contributors
 * \file lbfgs.h
 * \brief L-BFGS solver for general optimization problems
 *
 * \author Tianqi Chen
 */
#ifndef RABIT_LEARN_LBFGS_H_
#define RABIT_LEARN_LBFGS_H_
#include <cmath>
#include <rabit.h>

namespace rabit {
/*! \brief namespace of solvers for general problems */
namespace solver {
/*!
 * \brief objective function for the optimizers;
 * the objective function can also implement Save/Load
 * to persist any state parameters it needs to remember
 */
template<typename DType>
class IObjFunction : public rabit::Serializable {
 public:
  // destructor
  virtual ~IObjFunction(void) {}
  /*!
   * \brief evaluate the function value for a given weight
   * \param weight weight of the function
   * \param size size of the weight
   */
  virtual double Eval(const DType *weight, size_t size) = 0;
  /*!
   * \return number of feature dimensions to be allocated,
   * only called once during initialization
   */
  virtual size_t InitNumDim(void) = 0;
  /*!
   * \brief initialize the weight before starting the solver,
   * only called once for initialization
   */
  virtual void InitModel(DType *weight, size_t size) = 0;
  /*!
   * \brief calculate the gradient for a given weight
   * \param out_grad used to store the gradient value of the function
   * \param weight weight of the function
   * \param size size of the weight
   */
  virtual void CalcGrad(DType *out_grad,
                        const DType *weight,
                        size_t size) = 0;
};
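To make the interface concrete, here is a minimal sketch (not part of the library) of an objective that could be plugged into the solver: f(w) = 0.5 * ||w||^2, whose gradient is simply w. The fixed dimension and starting point are illustrative assumptions; a real distributed objective would evaluate only its local data partition, since the solver Allreduces the results.

```c++
// Hypothetical toy objective: f(w) = 0.5 * ||w||^2, gradient is w itself.
// In a real distributed objective, Eval/CalcGrad should cover only the
// local data partition; the solver sums the results across workers.
template<typename DType>
class QuadraticObj : public IObjFunction<DType> {
 public:
  virtual double Eval(const DType *weight, size_t size) {
    double sum = 0.0;
    for (size_t i = 0; i < size; ++i) sum += 0.5 * weight[i] * weight[i];
    return sum;
  }
  // illustrative fixed dimension
  virtual size_t InitNumDim(void) { return 16; }
  // start away from the optimum at zero
  virtual void InitModel(DType *weight, size_t size) {
    for (size_t i = 0; i < size; ++i) weight[i] = static_cast<DType>(1.0);
  }
  virtual void CalcGrad(DType *out_grad, const DType *weight, size_t size) {
    for (size_t i = 0; i < size; ++i) out_grad[i] = weight[i];
  }
  // no extra state to checkpoint for this toy objective
  virtual void Load(rabit::Stream *fi) {}
  virtual void Save(rabit::Stream *fo) const {}
};
```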

/*! \brief a basic version of L-BFGS solver */
template<typename DType>
class LBFGSSolver {
 public:
  LBFGSSolver(void) {
    // set default values
    reg_L1 = 0.0f;
    max_linesearch_iter = 100;
    linesearch_backoff = 0.5f;
    linesearch_c1 = 1e-4;
    min_lbfgs_iter = 5;
    max_lbfgs_iter = 500;
    lbfgs_stop_tol = 1e-5f;
    silent = 0;
  }
  virtual ~LBFGSSolver(void) {}
  /*!
   * \brief set parameters from outside
   * \param name name of the parameter
   * \param val value of the parameter
   */
  virtual void SetParam(const char *name, const char *val) {
    if (!strcmp("num_dim", name)) {
      gstate.num_dim = static_cast<size_t>(atol(val));
    }
    if (!strcmp("size_memory", name)) {
      gstate.size_memory = static_cast<size_t>(atol(val));
    }
    if (!strcmp("reg_L1", name)) {
      reg_L1 = static_cast<float>(atof(val));
    }
    if (!strcmp("lbfgs_stop_tol", name)) {
      lbfgs_stop_tol = static_cast<float>(atof(val));
    }
    if (!strcmp("linesearch_backoff", name)) {
      linesearch_backoff = static_cast<float>(atof(val));
    }
    if (!strcmp("max_linesearch_iter", name)) {
      max_linesearch_iter = atoi(val);
    }
    if (!strcmp("max_lbfgs_iter", name)) {
      max_lbfgs_iter = atoi(val);
    }
    if (!strcmp("min_lbfgs_iter", name)) {
      min_lbfgs_iter = atoi(val);
    }
    if (!strcmp("linesearch_c1", name)) {
      linesearch_c1 = static_cast<float>(atof(val));
    }
  }
  /*!
   * \brief set the objective function to optimize;
   * the objective function only needs to evaluate and calculate
   * the gradient with respect to the current subset of data
   * \param obj the objective function to optimize
   */
  virtual void SetObjFunction(IObjFunction<DType> *obj) {
    gstate.obj = obj;
  }
  /*!
   * \brief initialize the L-BFGS solver;
   * the user must have set the objective function already
   */
  virtual void Init(void) {
    utils::Check(gstate.obj != NULL,
                 "LBFGSSolver.Init must SetObjFunction first");
    int version = rabit::LoadCheckPoint(&gstate, &hist);
    if (version == 0) {
      gstate.num_dim = gstate.obj->InitNumDim();
    } else {
      printf("restart from version=%d\n", version);
    }
    {
      // decide the parameter partition across workers
      size_t nproc = rabit::GetWorldSize();
      size_t rank = rabit::GetRank();
      size_t step = (gstate.num_dim + nproc - 1) / nproc;
      // align upward to a multiple of 8
      step = (step + 7) / 8 * 8;
      utils::Assert(step * nproc >= gstate.num_dim, "BUG");
      range_begin_ = std::min(rank * step, gstate.num_dim);
      range_end_ = std::min((rank + 1) * step, gstate.num_dim);
    }
    if (version == 0) {
      gstate.Init();
      hist.Init(range_end_ - range_begin_, gstate.size_memory);
      gstate.obj->InitModel(gstate.weight, gstate.num_dim);
      // broadcast the initialized model
      rabit::Broadcast(gstate.weight,
                       sizeof(DType) * gstate.num_dim, 0);
      gstate.old_objval = this->Eval(gstate.weight);
      gstate.init_objval = gstate.old_objval;

      if (silent == 0 && rabit::GetRank() == 0) {
        rabit::TrackerPrintf
            ("L-BFGS solver starts, num_dim=%lu, init_objval=%g, size_memory=%lu, RAM-approx=%lu\n",
             gstate.num_dim, gstate.init_objval, gstate.size_memory,
             gstate.MemCost() + hist.MemCost());
      }
    }
  }
  /*!
   * \brief get the current weight vector;
   * note that once an update function is called,
   * the content of the weight vector is no longer valid
   * \return weight vector
   */
  virtual DType *GetWeight(void) {
    return gstate.weight;
  }
  /*!
   * \brief update the weight for one L-BFGS iteration
   * \return whether the stopping condition is met
   */
  virtual bool UpdateOneIter(void) {
    bool stop = false;
    GlobalState &g = gstate;
    g.obj->CalcGrad(g.grad, g.weight, g.num_dim);
    rabit::Allreduce<rabit::op::Sum>(g.grad, g.num_dim);
    // find the change direction
    double vdot = FindChangeDirection(g.tempw, g.grad, g.weight);
    // line search; g.grad now holds the new weight
    int iter = BacktrackLineSearch(g.grad, g.tempw, g.weight, vdot);
    utils::Check(iter < max_linesearch_iter, "line search failed");
    // swap in the new weight
    std::swap(g.weight, g.grad);
    // check the stop condition
    if (gstate.num_iteration > static_cast<size_t>(min_lbfgs_iter)) {
      if (g.old_objval - g.new_objval < lbfgs_stop_tol * g.init_objval) {
        return true;
      }
    }
    if (silent == 0 && rabit::GetRank() == 0) {
      rabit::TrackerPrintf
          ("[%d] L-BFGS: linesearch finishes in %d rounds, new_objval=%g, improvement=%g\n",
           gstate.num_iteration, iter,
           gstate.new_objval,
           gstate.old_objval - gstate.new_objval);
    }
    gstate.old_objval = gstate.new_objval;
    rabit::CheckPoint(&gstate, &hist);
    return stop;
  }
  /*! \brief run the optimization */
  virtual void Run(void) {
    this->Init();
    while (gstate.num_iteration < static_cast<size_t>(max_lbfgs_iter)) {
      if (this->UpdateOneIter()) break;
    }
    if (silent == 0 && rabit::GetRank() == 0) {
      size_t nonzero = 0;
      for (size_t i = 0; i < gstate.num_dim; ++i) {
        if (gstate.weight[i] != 0.0f) nonzero += 1;
      }
      rabit::TrackerPrintf
          ("L-BFGS: finishes at iteration %d, %lu/%lu active weights\n",
           gstate.num_iteration, nonzero, gstate.num_dim);
    }
  }

 protected:
  // find the delta value given the gradient;
  // returns dot(dir, l1grad)
  virtual double FindChangeDirection(DType *dir,
                                     const DType *grad,
                                     const DType *weight) {
    int m = static_cast<int>(gstate.size_memory);
    int n = static_cast<int>(hist.num_useful());
    if (n < m) {
      utils::Assert(hist.num_useful() == gstate.num_iteration,
                    "BUG2, n=%d, it=%d", n, gstate.num_iteration);
    } else {
      utils::Assert(n == m, "BUG3");
    }
    const size_t num_dim = gstate.num_dim;
    const DType *gsub = grad + range_begin_;
    const size_t nsub = range_end_ - range_begin_;
    double vdot = 0.0;
    if (n != 0) {
      // hist[m + n - 1] stores the old gradient
      Minus(hist[m + n - 1], gsub, hist[m + n - 1], nsub);
      SetL1Dir(hist[2 * m], gsub, weight + range_begin_, nsub);
      // index set for calculating the results
      std::vector<std::pair<size_t, size_t> > idxset;
      for (int j = 0; j < n; ++j) {
        idxset.push_back(std::make_pair(j, 2 * m));
        idxset.push_back(std::make_pair(j, n - 1));
        idxset.push_back(std::make_pair(j, m + n - 1));
      }
      for (int j = 0; j < n; ++j) {
        idxset.push_back(std::make_pair(m + j, 2 * m));
        idxset.push_back(std::make_pair(m + j, m + n - 1));
      }

      // calculate the dot products
      std::vector<double> tmp(idxset.size());
      for (size_t i = 0; i < tmp.size(); ++i) {
        tmp[i] = hist.CalcDot(idxset[i].first, idxset[i].second);
      }

      rabit::Allreduce<rabit::op::Sum>(BeginPtr(tmp), tmp.size());

      for (size_t i = 0; i < tmp.size(); ++i) {
        gstate.DotBuf(idxset[i].first, idxset[i].second) = tmp[i];
      }

      // BFGS two-loop recursion, using the vector-free update:
      // parameterize the vectors using the basis in hist
      std::vector<double> alpha(n);
      std::vector<double> delta(2 * m + 1, 0.0);
      delta[2 * m] = 1.0;
      // backward step
      for (int j = n - 1; j >= 0; --j) {
        double vsum = 0.0;
        for (size_t k = 0; k < delta.size(); ++k) {
          vsum += delta[k] * gstate.DotBuf(k, j);
        }
        alpha[j] = vsum / gstate.DotBuf(j, m + j);
        delta[m + j] = delta[m + j] - alpha[j];
      }
      // scale
      double scale = gstate.DotBuf(n - 1, m + n - 1) /
          gstate.DotBuf(m + n - 1, m + n - 1);
      for (size_t k = 0; k < delta.size(); ++k) {
        delta[k] *= scale;
      }
      // forward step
      for (int j = 0; j < n; ++j) {
        double vsum = 0.0;
        for (size_t k = 0; k < delta.size(); ++k) {
          vsum += delta[k] * gstate.DotBuf(k, m + j);
        }
        double beta = vsum / gstate.DotBuf(j, m + j);
        delta[j] = delta[j] + (alpha[j] - beta);
      }

      // set all to zero
      std::fill(dir, dir + num_dim, 0.0f);
      DType *dirsub = dir + range_begin_;
      for (int i = 0; i < n; ++i) {
        AddScale(dirsub, dirsub, hist[m + i], delta[m + i], nsub);
      }
      AddScale(dirsub, dirsub, hist[2 * m], delta[2 * m], nsub);
      for (int i = 0; i < n; ++i) {
        AddScale(dirsub, dirsub, hist[i], delta[i], nsub);
      }
      FixDirL1Sign(dirsub, hist[2 * m], nsub);
      vdot = -Dot(dirsub, hist[2 * m], nsub);

      // allreduce to get the full direction
      rabit::Allreduce<rabit::op::Sum>(dir, num_dim);
      rabit::Allreduce<rabit::op::Sum>(&vdot, 1);
    } else {
      SetL1Dir(dir, grad, weight, num_dim);
      vdot = -Dot(dir, dir, num_dim);
    }
    // shift the history record
    if (n < m) {
      n += 1;
    } else {
      gstate.Shift(); hist.Shift();
    }
    hist.set_num_useful(n);
    // copy the gradient to hist[m + n - 1]
    memcpy(hist[m + n - 1], gsub, nsub * sizeof(DType));
    return vdot;
  }
  // backtracking line search along the given direction;
  // returns the number of line-search iterations used
  inline int BacktrackLineSearch(DType *new_weight,
                                 const DType *dir,
                                 const DType *weight,
                                 double dot_dir_l1grad) {
    utils::Assert(dot_dir_l1grad < 0.0f,
                  "gradient error, dotv=%g", dot_dir_l1grad);
    double alpha = 1.0;
    double backoff = linesearch_backoff;
    // unit descent direction in the first iteration
    if (gstate.num_iteration == 0) {
      utils::Assert(hist.num_useful() == 1, "hist.nuseful");
      alpha = 1.0f / std::sqrt(-dot_dir_l1grad);
      backoff = 0.1f;
    }
    int iter = 0;

    double old_val = gstate.old_objval;
    double c1 = this->linesearch_c1;
    while (true) {
      const size_t num_dim = gstate.num_dim;
      if (++iter >= max_linesearch_iter) return iter;
      AddScale(new_weight, weight, dir, alpha, num_dim);
      this->FixWeightL1Sign(new_weight, weight, num_dim);
      double new_val = this->Eval(new_weight);
      // Armijo sufficient-decrease condition
      if (new_val - old_val <= c1 * dot_dir_l1grad * alpha) {
        gstate.new_objval = new_val; break;
      }
      alpha *= backoff;
    }
    // hist[n - 1] = new_weight - weight
    Minus(hist[hist.num_useful() - 1],
          new_weight + range_begin_,
          weight + range_begin_,
          range_end_ - range_begin_);
    gstate.num_iteration += 1;
    return iter;
  }
  // OWL-QN step for L1 regularization:
  // steepest descent direction using the L1 subgradient
  inline void SetL1Dir(DType *dst,
                       const DType *grad,
                       const DType *weight,
                       size_t size) {
    if (reg_L1 == 0.0f) {
      for (size_t i = 0; i < size; ++i) {
        dst[i] = -grad[i];
      }
    } else {
      for (size_t i = 0; i < size; ++i) {
        if (weight[i] > 0.0f) {
          dst[i] = -grad[i] - reg_L1;
        } else if (weight[i] < 0.0f) {
          dst[i] = -grad[i] + reg_L1;
        } else {
          if (grad[i] < -reg_L1) {
            dst[i] = -grad[i] - reg_L1;
          } else if (grad[i] > reg_L1) {
            dst[i] = -grad[i] + reg_L1;
          } else {
            dst[i] = 0.0;
          }
        }
      }
    }
  }
  // OWL-QN step: fix the direction sign to be consistent with the proposal
  inline void FixDirL1Sign(DType *dir,
                           const DType *steepdir,
                           size_t size) {
    if (reg_L1 != 0.0f) {
      for (size_t i = 0; i < size; ++i) {
        if (dir[i] * steepdir[i] <= 0.0f) {
          dir[i] = 0.0f;
        }
      }
    }
  }
  // OWL-QN step: zero out weights that cross the orthant boundary
  inline void FixWeightL1Sign(DType *new_weight,
                              const DType *weight,
                              size_t size) {
    if (reg_L1 != 0.0f) {
      for (size_t i = 0; i < size; ++i) {
        if (new_weight[i] * weight[i] < 0.0f) {
          new_weight[i] = 0.0f;
        }
      }
    }
  }
  inline double Eval(const DType *weight) {
    double val = gstate.obj->Eval(weight, gstate.num_dim);
    rabit::Allreduce<rabit::op::Sum>(&val, 1);
    if (reg_L1 != 0.0f) {
      double l1norm = 0.0;
      for (size_t i = 0; i < gstate.num_dim; ++i) {
        l1norm += std::abs(weight[i]);
      }
      val += l1norm * reg_L1;
    }
    return val;
  }

 private:
  // helper functions
  // dst = lhs + rhs * scale
  inline static void AddScale(DType *dst,
                              const DType *lhs,
                              const DType *rhs,
                              DType scale,
                              size_t size) {
    for (size_t i = 0; i < size; ++i) {
      dst[i] = lhs[i] + rhs[i] * scale;
    }
  }
  // dst = lhs - rhs
  inline static void Minus(DType *dst,
                           const DType *lhs,
                           const DType *rhs,
                           size_t size) {
    for (size_t i = 0; i < size; ++i) {
      dst[i] = lhs[i] - rhs[i];
    }
  }
  // returns dot(lhs, rhs)
  inline static double Dot(const DType *lhs,
                           const DType *rhs,
                           size_t size) {
    double res = 0.0;
    for (size_t i = 0; i < size; ++i) {
      res += lhs[i] * rhs[i];
    }
    return res;
  }
  // map a rolling array index
  inline static size_t MapIndex(size_t i, size_t offset,
                                size_t size_memory) {
    if (i == 2 * size_memory) return i;
    if (i < size_memory) {
      return (i + offset) % size_memory;
    } else {
      utils::Assert(i < 2 * size_memory,
                    "MapIndex: index exceeds bound, i=%lu", i);
      return (i + offset) % size_memory + size_memory;
    }
  }
  // global solver state
  struct GlobalState : public rabit::Serializable {
   public:
    // memory size of L-BFGS
    size_t size_memory;
    // number of iterations passed
    size_t num_iteration;
    // number of features in the solver
    size_t num_dim;
    // initial objective value
    double init_objval;
    // previous objective value
    double old_objval;
    // new objective value
    double new_objval;
    // objective function
    IObjFunction<DType> *obj;
    // temporary storage
    DType *grad, *weight, *tempw;
    // constructor
    GlobalState(void)
        : obj(NULL), grad(NULL),
          weight(NULL), tempw(NULL) {
      size_memory = 10;
      num_iteration = 0;
      num_dim = 0;
      old_objval = 0.0;
      offset_ = 0;
    }
    ~GlobalState(void) {
      if (grad != NULL) {
        delete [] grad;
        delete [] weight;
        delete [] tempw;
      }
    }
    // initialize the space of the rolling array
    inline void Init(void) {
      size_t n = size_memory * 2 + 1;
      data.resize(n * n, 0.0);
      this->AllocSpace();
    }
    // memory cost
    inline size_t MemCost(void) const {
      return sizeof(DType) * 3 * num_dim;
    }
    inline double &DotBuf(size_t i, size_t j) {
      if (i > j) std::swap(i, j);
      return data[MapIndex(i, offset_, size_memory) * (size_memory * 2 + 1) +
                  MapIndex(j, offset_, size_memory)];
    }
    // load the shift array
    virtual void Load(rabit::Stream *fi) {
      fi->Read(&size_memory, sizeof(size_memory));
      fi->Read(&num_iteration, sizeof(num_iteration));
      fi->Read(&num_dim, sizeof(num_dim));
      fi->Read(&init_objval, sizeof(init_objval));
      fi->Read(&old_objval, sizeof(old_objval));
      fi->Read(&offset_, sizeof(offset_));
      fi->Read(&data);
      this->AllocSpace();
      fi->Read(weight, sizeof(DType) * num_dim);
      obj->Load(fi);
    }
    // save the shift array
    virtual void Save(rabit::Stream *fo) const {
      fo->Write(&size_memory, sizeof(size_memory));
      fo->Write(&num_iteration, sizeof(num_iteration));
      fo->Write(&num_dim, sizeof(num_dim));
      fo->Write(&init_objval, sizeof(init_objval));
      fo->Write(&old_objval, sizeof(old_objval));
      fo->Write(&offset_, sizeof(offset_));
      fo->Write(data);
      fo->Write(weight, sizeof(DType) * num_dim);
      obj->Save(fo);
    }
    inline void Shift(void) {
      offset_ = (offset_ + 1) % size_memory;
    }

   private:
    // rolling offset in the current memory
    size_t offset_;
    std::vector<double> data;
    // allocate space
    inline void AllocSpace(void) {
      if (grad == NULL) {
        grad = new DType[num_dim];
        weight = new DType[num_dim];
        tempw = new DType[num_dim];
      }
    }
  };
  /*! \brief rolling array that carries the history information */
  struct HistoryArray : public rabit::Serializable {
   public:
    HistoryArray(void) : dptr_(NULL) {
      num_useful_ = 0;
    }
    ~HistoryArray(void) {
      if (dptr_ != NULL) delete [] dptr_;
    }
    // initialize the space of the rolling array
    inline void Init(size_t num_col, size_t size_memory) {
      if (dptr_ != NULL &&
          (num_col_ != num_col || size_memory_ != size_memory)) {
        delete [] dptr_;
      }
      num_col_ = num_col;
      size_memory_ = size_memory;
      stride_ = num_col_;
      offset_ = 0;
      size_t n = size_memory * 2 + 1;
      dptr_ = new DType[n * stride_];
    }
    // memory cost
    inline size_t MemCost(void) const {
      return sizeof(DType) * (size_memory_ * 2 + 1) * stride_;
    }
    // fetch an element from the rolling array
    inline const DType *operator[](size_t i) const {
      return dptr_ + MapIndex(i, offset_, size_memory_) * stride_;
    }
    inline DType *operator[](size_t i) {
      return dptr_ + MapIndex(i, offset_, size_memory_) * stride_;
    }
    // shift array: arr_old -> arr_new
    // for i in [0, size_memory - 1), arr_new[i] = arr_old[i + 1]
    // for i in [size_memory, 2 * size_memory - 1), arr_new[i] = arr_old[i + 1]
    // arr_old[0] and arr_old[size_memory] will be discarded
    inline void Shift(void) {
      offset_ = (offset_ + 1) % size_memory_;
    }
    inline double CalcDot(size_t i, size_t j) const {
      return Dot((*this)[i], (*this)[j], num_col_);
    }
    // get the number of useful memory slots
    inline const size_t &num_useful(void) const {
      return num_useful_;
    }
    // set the number of useful memory slots
    inline void set_num_useful(size_t num_useful) {
      utils::Assert(num_useful <= size_memory_,
                    "num_useful exceeds bound");
      num_useful_ = num_useful;
    }
    // load the shift array
    virtual void Load(rabit::Stream *fi) {
      fi->Read(&num_col_, sizeof(num_col_));
      fi->Read(&stride_, sizeof(stride_));
      fi->Read(&size_memory_, sizeof(size_memory_));
      fi->Read(&num_useful_, sizeof(num_useful_));
      this->Init(num_col_, size_memory_);
      for (size_t i = 0; i < num_useful_; ++i) {
        fi->Read((*this)[i], num_col_ * sizeof(DType));
        fi->Read((*this)[i + size_memory_], num_col_ * sizeof(DType));
      }
    }
    // save the shift array
    virtual void Save(rabit::Stream *fo) const {
      fo->Write(&num_col_, sizeof(num_col_));
      fo->Write(&stride_, sizeof(stride_));
      fo->Write(&size_memory_, sizeof(size_memory_));
      fo->Write(&num_useful_, sizeof(num_useful_));
      for (size_t i = 0; i < num_useful_; ++i) {
        fo->Write((*this)[i], num_col_ * sizeof(DType));
        fo->Write((*this)[i + size_memory_], num_col_ * sizeof(DType));
      }
    }

   private:
    // number of columns in each array
    size_t num_col_;
    // stride of each column for alignment
    size_t stride_;
    // memory size of L-BFGS
    size_t size_memory_;
    // number of useful memory slots that will be used
    size_t num_useful_;
    // rolling offset in the current memory
    size_t offset_;
    // data pointer
    DType *dptr_;
  };
  // data structures for L-BFGS
  GlobalState gstate;
  HistoryArray hist;
  // silent mode
  int silent;
  // the subrange owned by the current node
  size_t range_begin_;
  size_t range_end_;
  // L1 regularization coefficient
  float reg_L1;
  // c1 ratio for the line search
  float linesearch_c1;
  float linesearch_backoff;
  int max_linesearch_iter;
  int max_lbfgs_iter;
  int min_lbfgs_iter;
  float lbfgs_stop_tol;
};
}  // namespace solver
}  // namespace rabit
#endif  // RABIT_LEARN_LBFGS_H_
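For reference, a minimal driver sketch showing how the solver is wired up (it mirrors the flow in linear.cc; `QuadraticObj` is the hypothetical objective sketched earlier, and the parameter value is illustrative):

```c++
#include <rabit.h>
#include "./lbfgs.h"

int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  QuadraticObj<float> obj;                 // hypothetical objective from above
  rabit::solver::LBFGSSolver<float> lbfgs;
  lbfgs.SetObjFunction(&obj);              // must be set before Init/Run
  lbfgs.SetParam("max_lbfgs_iter", "50");  // illustrative parameter value
  lbfgs.Run();                             // Init() + UpdateOneIter() loop
  const float *w = lbfgs.GetWeight();      // final weight vector
  (void)w;                                 // e.g. save it on rank 0
  rabit::Finalize();
  return 0;
}
```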
@ -1,101 +0,0 @@
/*!
 * Copyright (c) 2015 by Contributors
 * \file data.h
 * \brief simple data structures that can be used by the models
 *
 * \author Tianqi Chen
 */
#ifndef RABIT_LEARN_DATA_H_
#define RABIT_LEARN_DATA_H_

#include <vector>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <limits>
#include <cmath>
#include <sstream>
#include <rabit.h>
#include "../io/io.h"

namespace rabit {
// typedef the index type
typedef unsigned index_t;

/*! \brief sparse matrix, CSR format */
struct SparseMat {
  // sparse matrix entry
  struct Entry {
    // feature index
    index_t findex;
    // feature value
    float fvalue;
  };
  // sparse vector
  struct Vector {
    const Entry *data;
    index_t length;
    inline const Entry &operator[](size_t i) const {
      return data[i];
    }
  };
  inline Vector operator[](size_t i) const {
    Vector v;
    v.data = &data[0] + row_ptr[i];
    v.length = static_cast<index_t>(row_ptr[i + 1] - row_ptr[i]);
    return v;
  }
  // load data from LibSVM format
  inline void Load(const char *fname) {
    io::InputSplit *in =
        io::CreateInputSplit
        (fname, rabit::GetRank(),
         rabit::GetWorldSize());
    row_ptr.clear();
    row_ptr.push_back(0);
    data.clear();
    feat_dim = 0;
    std::string line;
    while (in->ReadLine(&line)) {
      float label;
      std::istringstream ss(line);
      ss >> label;
      Entry e;
      unsigned long fidx;
      while (!ss.eof()) {
        if (!(ss >> fidx)) break;
        ss.ignore(32, ':');
        if (!(ss >> e.fvalue)) break;
        e.findex = static_cast<index_t>(fidx);
        data.push_back(e);
        feat_dim = std::max(static_cast<size_t>(fidx), feat_dim);
      }
      labels.push_back(label);
      row_ptr.push_back(data.size());
    }
    delete in;
    feat_dim += 1;
    utils::Check(feat_dim < std::numeric_limits<index_t>::max(),
                 "feature dimension exceeds the limit of index_t; "
                 "consider changing index_t to unsigned long");
  }
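  // Example of the LibSVM text input parsed by Load() above, one instance
  // per line in the form "label findex:fvalue ..." (the feature indices
  // and values here are illustrative only):
  //   1 126:1 127:1 129:0.5
  //   0 6:1 22:1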
  inline size_t NumRow(void) const {
    return row_ptr.size() - 1;
  }
  // memory cost
  inline size_t MemCost(void) const {
    return data.size() * sizeof(Entry);
  }
  // maximum feature dimension
  size_t feat_dim;
  std::vector<size_t> row_ptr;
  std::vector<Entry> data;
  std::vector<float> labels;
};

/*! \brief returns a random integer in [0, value) */
inline int Random(int value) {
  return rand() % value;
}
}  // namespace rabit
#endif  // RABIT_LEARN_DATA_H_