Merge commit '57b5d7873f4f0953357e9d98e9c60cff8373d7ec'
commit 9f7c6fe271
@ -2,7 +2,7 @@ ifndef CXX
 export CXX = g++
 endif
 export MPICXX = mpicxx
-export LDFLAGS= -Llib
+export LDFLAGS= -Llib -lrt
 export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -pedantic
 export CFLAGS = -O3 -msse2 -fPIC $(WARNFLAGS)

@ -50,7 +50,7 @@ $(ALIB):
	ar cr $@ $+

 $(SLIB) :
-	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^)
+	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)

 clean:
	$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) *~ src/*~ include/*~ include/*/*~ wrapper/*~
@ -13,7 +13,7 @@ All these features comes from the facts about small rabbit:)
 * Portable: rabit is light weight and runs everywhere
 - Rabit is a library instead of a framework, a program only needs to link the library to run
 - Rabit only replies on a mechanism to start program, which was provided by most framework
-- You can run rabit programs on many platforms, including Hadoop, MPI using the same code
+- You can run rabit programs on many platforms, including Yarn(Hadoop), MPI using the same code
 * Scalable and Flexible: rabit runs fast
 * Rabit program use Allreduce to communicate, and do not suffer the cost between iterations of MapReduce abstraction.
 - Programs can call rabit functions in any order, as opposed to frameworks where callbacks are offered and called by the framework, i.e. inversion of control principle.
@ -341,12 +341,11 @@ Rabit is a portable library that can run on multiple platforms.
 * This script will restart the program when it exits with -2, so it can be used for [mock test](#link-against-mock-test-library)

 #### Running Rabit on Hadoop
-* You can use [../tracker/rabit_hadoop.py](../tracker/rabit_hadoop.py) to run rabit programs on hadoop
+* You can use [../tracker/rabit_yarn.py](../tracker/rabit_yarn.py) to run rabit programs as Yarn application
-* This will start n rabit programs as mappers of MapReduce
+* This will start rabit programs as yarn applications
-* Each program can read its portion of data from stdin
-* Yarn(Hadoop 2.0 or higher) is highly recommended, since Yarn allows specifying number of cpus and memory of each mapper:
 - This allows multi-threading programs in each node, which can be more efficient
 - An easy multi-threading solution could be to use OpenMP with rabit code
+* It is also possible to run rabit program via hadoop streaming, however, YARN is highly recommended.

 #### Running Rabit using MPI
 * You can submit rabit programs to an MPI cluster using [../tracker/rabit_mpi.py](../tracker/rabit_mpi.py).
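As a side note to the multi-threading suggestion above, here is a minimal, illustrative sketch of the pattern: OpenMP handles local parallelism inside one worker, and a rabit Allreduce combines the per-worker partial results. The data is synthetic and the `-fopenmp` compile flag is assumed.

```cpp
// Illustrative sketch only: OpenMP for local parallelism inside one worker,
// rabit Allreduce to combine the per-worker partial sums.
#include <vector>
#include <cstdio>
#include <rabit.h>

int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  std::vector<float> data(1000, 1.0f);  // synthetic local data
  float local_sum = 0.0f;
  // Multi-threaded local reduction (requires compiling with -fopenmp).
  #pragma omp parallel for reduction(+:local_sum)
  for (int i = 0; i < static_cast<int>(data.size()); ++i) {
    local_sum += data[i];
  }
  // Single-element Allreduce turns the local sums into a global sum.
  rabit::Allreduce<rabit::op::Sum>(&local_sum, 1);
  if (rabit::GetRank() == 0) {
    std::printf("global sum = %g\n", local_sum);
  }
  rabit::Finalize();
  return 0;
}
```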
@ -358,15 +357,15 @@ tracker scripts, such as [../tracker/rabit_hadoop.py](../tracker/rabit_hadoop.py

 You will need to implement a platform dependent submission function with the following definition
 ```python
-def fun_submit(nworkers, worker_args):
+def fun_submit(nworkers, worker_args, worker_envs):
     """
     customized submit script, that submits nslave jobs,
     each must contain args as parameter
     note this can be a lambda closure
     Parameters
        nworkers number of worker processes to start
-       worker_args tracker information which must be passed to the arguments
-         this usually includes the parameters of master_uri and port, etc.
+       worker_args additional arguments that need to be passed to the worker
+       worker_envs environment variables that need to be set for the worker
     """
 ```
 The submission function should start nworkers processes in the platform, and append worker_args to the end of the other arguments.
@ -374,7 +373,7 @@ Then you can simply call ```tracker.submit``` with fun_submit to submit jobs to

 Note that the current rabit tracker does not restart a worker when it dies, the restart of a node is done by the platform, otherwise we should write the fail-restart logic in the custom script.
 * Fail-restart is usually provided by most platforms.
-* For example, mapreduce will restart a mapper when it fails
+- rabit-yarn provides such functionality in YARN

 Fault Tolerance
 =====
@ -23,6 +23,8 @@ class ISeekStream: public IStream {
   virtual void Seek(size_t pos) = 0;
   /*! \brief tell the position of the stream */
   virtual size_t Tell(void) = 0;
+  /*! \return whether we are at end of file */
+  virtual bool AtEnd(void) const = 0;
 };

 /*! \brief fixed size memory buffer */
@ -55,7 +57,9 @@ struct MemoryFixSizeBuffer : public ISeekStream {
   virtual size_t Tell(void) {
     return curr_ptr_;
   }
+  virtual bool AtEnd(void) const {
+    return curr_ptr_ == buffer_size_;
+  }
  private:
   /*! \brief in memory buffer */
   char *p_buffer_;
@ -95,7 +99,9 @@ struct MemoryBufferStream : public ISeekStream {
   virtual size_t Tell(void) {
     return curr_ptr_;
   }
+  virtual bool AtEnd(void) const {
+    return curr_ptr_ == p_buffer_->length();
+  }
  private:
   /*! \brief in memory buffer */
   std::string *p_buffer_;
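For context, a small usage sketch of the new `AtEnd()` query on an in-memory seek stream. The header name, the namespace, and the `(pointer, size)` constructor of `MemoryFixSizeBuffer` are assumptions for illustration.

```cpp
// Illustrative only: drain a MemoryFixSizeBuffer through the new AtEnd() query.
// The header name and the (pointer, size) constructor are assumptions here.
#include <cstdio>
#include <cstring>
#include "rabit_serializable.h"  // assumed header declaring the memory streams

int main() {
  char data[] = "hello rabit!";
  rabit::utils::MemoryFixSizeBuffer buf(data, std::strlen(data));
  char chunk[4];
  // Read fixed-size chunks until the stream reports end of buffer.
  while (!buf.AtEnd()) {
    size_t n = buf.Read(chunk, sizeof(chunk));
    std::fwrite(chunk, 1, n, stdout);
  }
  std::printf("\n");
  return 0;
}
```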
@ -3,9 +3,13 @@
  * \brief This file defines the utils for timing
  * \author Tianqi Chen, Nacho, Tianyi
  */
-#ifndef RABIT_TIMER_H
-#define RABIT_TIMER_H
+#ifndef RABIT_TIMER_H_
+#define RABIT_TIMER_H_
 #include <time.h>
+#ifdef __MACH__
+#include <mach/clock.h>
+#include <mach/mach.h>
+#endif
 #include "./utils.h"

 namespace rabit {
@ -14,10 +18,19 @@ namespace utils {
  * \brief return time in seconds, not cross platform, avoid to use this in most places
  */
 inline double GetTime(void) {
+#ifdef __MACH__
+  clock_serv_t cclock;
+  mach_timespec_t mts;
+  host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+  utils::Check(clock_get_time(cclock, &mts) == 0, "failed to get time");
+  mach_port_deallocate(mach_task_self(), cclock);
+  return static_cast<double>(mts.tv_sec) + static_cast<double>(mts.tv_nsec) * 1e-9;
+#else
   timespec ts;
   utils::Check(clock_gettime(CLOCK_REALTIME, &ts) == 0, "failed to get time");
   return static_cast<double>(ts.tv_sec) + static_cast<double>(ts.tv_nsec) * 1e-9;
+#endif
 }
-}
-}
-#endif
+}  // namespace utils
+}  // namespace rabit
+#endif  // RABIT_TIMER_H_
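A brief, hedged sketch of how `utils::GetTime` is typically used to time a region of code; the include path is an assumption.

```cpp
// Illustrative only: time a region of work with the cross-platform GetTime().
#include <cstdio>
#include "timer.h"  // assumed include path for the timer helper shown above

int main() {
  double start = rabit::utils::GetTime();
  double s = 0.0;
  for (int i = 0; i < 1000000; ++i) s += i * 1e-9;  // some work to measure
  double elapsed = rabit::utils::GetTime() - start;
  std::printf("result=%g, took %.6f sec\n", s, elapsed);
  return 0;
}
```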
@ -5,15 +5,13 @@ It also contain links to the Machine Learning packages that uses rabit.

 * Contribution of toolkits, examples, benchmarks is more than welcomed!

 Toolkits
 ====
 * [KMeans Clustering](kmeans)
 * [Linear and Logistic Regression](linear)

 * [XGBoost: eXtreme Gradient Boosting](https://github.com/tqchen/xgboost/tree/master/multi-node)
   - xgboost is a very fast boosted tree(also known as GBDT) library, that can run more than
     10 times faster than existing packages
   - Rabit carries xgboost to distributed enviroment, inheritating all the benefits of xgboost
     single node version, and scale it to even larger problems
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef RABIT_LEARN_UTILS_BASE64_H_
|
#ifndef RABIT_LEARN_IO_BASE64_INL_H_
|
||||||
#define RABIT_LEARN_UTILS_BASE64_H_
|
#define RABIT_LEARN_IO_BASE64_INL_H_
|
||||||
/*!
|
/*!
|
||||||
* \file base64.h
|
* \file base64.h
|
||||||
* \brief data stream support to input and output from/to base64 stream
|
* \brief data stream support to input and output from/to base64 stream
|
||||||
@ -8,10 +8,11 @@
|
|||||||
*/
|
*/
|
||||||
#include <cctype>
|
#include <cctype>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <rabit/io.h>
|
#include "./io.h"
|
||||||
|
#include "./buffer_reader-inl.h"
|
||||||
|
|
||||||
namespace rabit {
|
namespace rabit {
|
||||||
namespace utils {
|
namespace io {
|
||||||
/*! \brief namespace of base64 decoding and encoding table */
|
/*! \brief namespace of base64 decoding and encoding table */
|
||||||
namespace base64 {
|
namespace base64 {
|
||||||
const char DecodeTable[] = {
|
const char DecodeTable[] = {
|
||||||
@ -34,7 +35,8 @@ static const char EncodeTable[] =
|
|||||||
/*! \brief the stream that reads from base64, note we take from file pointers */
|
/*! \brief the stream that reads from base64, note we take from file pointers */
|
||||||
class Base64InStream: public IStream {
|
class Base64InStream: public IStream {
|
||||||
public:
|
public:
|
||||||
explicit Base64InStream(FILE *fp) : fp(fp) {
|
explicit Base64InStream(IStream *fs) : reader_(256) {
|
||||||
|
reader_.set_stream(fs);
|
||||||
num_prev = 0; tmp_ch = 0;
|
num_prev = 0; tmp_ch = 0;
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
@ -44,7 +46,7 @@ class Base64InStream: public IStream {
|
|||||||
inline void InitPosition(void) {
|
inline void InitPosition(void) {
|
||||||
// get a charater
|
// get a charater
|
||||||
do {
|
do {
|
||||||
tmp_ch = fgetc(fp);
|
tmp_ch = reader_.GetChar();
|
||||||
} while (isspace(tmp_ch));
|
} while (isspace(tmp_ch));
|
||||||
}
|
}
|
||||||
/*! \brief whether current position is end of a base64 stream */
|
/*! \brief whether current position is end of a base64 stream */
|
||||||
@ -85,19 +87,19 @@ class Base64InStream: public IStream {
|
|||||||
nvalue = DecodeTable[tmp_ch] << 18;
|
nvalue = DecodeTable[tmp_ch] << 18;
|
||||||
{
|
{
|
||||||
// second byte
|
// second byte
|
||||||
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
|
||||||
"invalid base64 format");
|
"invalid base64 format");
|
||||||
nvalue |= DecodeTable[tmp_ch] << 12;
|
nvalue |= DecodeTable[tmp_ch] << 12;
|
||||||
*cptr++ = (nvalue >> 16) & 0xFF; --tlen;
|
*cptr++ = (nvalue >> 16) & 0xFF; --tlen;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// third byte
|
// third byte
|
||||||
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
|
||||||
"invalid base64 format");
|
"invalid base64 format");
|
||||||
// handle termination
|
// handle termination
|
||||||
if (tmp_ch == '=') {
|
if (tmp_ch == '=') {
|
||||||
Check((tmp_ch = fgetc(fp), tmp_ch == '='), "invalid base64 format");
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == '='), "invalid base64 format");
|
||||||
Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
|
||||||
"invalid base64 format");
|
"invalid base64 format");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -110,10 +112,10 @@ class Base64InStream: public IStream {
|
|||||||
}
|
}
|
||||||
{
|
{
|
||||||
// fourth byte
|
// fourth byte
|
||||||
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
|
||||||
"invalid base64 format");
|
"invalid base64 format");
|
||||||
if (tmp_ch == '=') {
|
if (tmp_ch == '=') {
|
||||||
Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
|
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
|
||||||
"invalid base64 format");
|
"invalid base64 format");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -125,10 +127,10 @@ class Base64InStream: public IStream {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// get next char
|
// get next char
|
||||||
tmp_ch = fgetc(fp);
|
tmp_ch = reader_.GetChar();
|
||||||
}
|
}
|
||||||
if (kStrictCheck) {
|
if (kStrictCheck) {
|
||||||
Check(tlen == 0, "Base64InStream: read incomplete");
|
utils::Check(tlen == 0, "Base64InStream: read incomplete");
|
||||||
}
|
}
|
||||||
return size - tlen;
|
return size - tlen;
|
||||||
}
|
}
|
||||||
@ -137,7 +139,7 @@ class Base64InStream: public IStream {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
FILE *fp;
|
StreamBufferReader reader_;
|
||||||
int tmp_ch;
|
int tmp_ch;
|
||||||
int num_prev;
|
int num_prev;
|
||||||
unsigned char buf_prev[2];
|
unsigned char buf_prev[2];
|
||||||
@ -147,7 +149,7 @@ class Base64InStream: public IStream {
|
|||||||
/*! \brief the stream that write to base64, note we take from file pointers */
|
/*! \brief the stream that write to base64, note we take from file pointers */
|
||||||
class Base64OutStream: public IStream {
|
class Base64OutStream: public IStream {
|
||||||
public:
|
public:
|
||||||
explicit Base64OutStream(FILE *fp) : fp(fp) {
|
explicit Base64OutStream(IStream *fp) : fp(fp) {
|
||||||
buf_top = 0;
|
buf_top = 0;
|
||||||
}
|
}
|
||||||
virtual void Write(const void *ptr, size_t size) {
|
virtual void Write(const void *ptr, size_t size) {
|
||||||
@ -160,16 +162,16 @@ class Base64OutStream: public IStream {
|
|||||||
}
|
}
|
||||||
if (buf_top == 3) {
|
if (buf_top == 3) {
|
||||||
// flush 4 bytes out
|
// flush 4 bytes out
|
||||||
fputc(EncodeTable[buf[1] >> 2], fp);
|
PutChar(EncodeTable[buf[1] >> 2]);
|
||||||
fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
|
PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
|
||||||
fputc(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F], fp);
|
PutChar(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F]);
|
||||||
fputc(EncodeTable[buf[3] & 0x3F], fp);
|
PutChar(EncodeTable[buf[3] & 0x3F]);
|
||||||
buf_top = 0;
|
buf_top = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
virtual size_t Read(void *ptr, size_t size) {
|
virtual size_t Read(void *ptr, size_t size) {
|
||||||
Error("Base64OutStream do not support read");
|
utils::Error("Base64OutStream do not support read");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
@ -179,26 +181,38 @@ class Base64OutStream: public IStream {
|
|||||||
inline void Finish(char endch = EOF) {
|
inline void Finish(char endch = EOF) {
|
||||||
using base64::EncodeTable;
|
using base64::EncodeTable;
|
||||||
if (buf_top == 1) {
|
if (buf_top == 1) {
|
||||||
fputc(EncodeTable[buf[1] >> 2], fp);
|
PutChar(EncodeTable[buf[1] >> 2]);
|
||||||
fputc(EncodeTable[(buf[1] << 4) & 0x3F], fp);
|
PutChar(EncodeTable[(buf[1] << 4) & 0x3F]);
|
||||||
fputc('=', fp);
|
PutChar('=');
|
||||||
fputc('=', fp);
|
PutChar('=');
|
||||||
}
|
}
|
||||||
if (buf_top == 2) {
|
if (buf_top == 2) {
|
||||||
fputc(EncodeTable[buf[1] >> 2], fp);
|
PutChar(EncodeTable[buf[1] >> 2]);
|
||||||
fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
|
PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
|
||||||
fputc(EncodeTable[(buf[2] << 2) & 0x3F], fp);
|
PutChar(EncodeTable[(buf[2] << 2) & 0x3F]);
|
||||||
fputc('=', fp);
|
PutChar('=');
|
||||||
}
|
}
|
||||||
buf_top = 0;
|
buf_top = 0;
|
||||||
if (endch != EOF) fputc(endch, fp);
|
if (endch != EOF) PutChar(endch);
|
||||||
|
this->Flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
FILE *fp;
|
IStream *fp;
|
||||||
int buf_top;
|
int buf_top;
|
||||||
unsigned char buf[4];
|
unsigned char buf[4];
|
||||||
|
std::string out_buf;
|
||||||
|
const static size_t kBufferSize = 256;
|
||||||
|
|
||||||
|
inline void PutChar(char ch) {
|
||||||
|
out_buf += ch;
|
||||||
|
if (out_buf.length() >= kBufferSize) Flush();
|
||||||
|
}
|
||||||
|
inline void Flush(void) {
|
||||||
|
fp->Write(BeginPtr(out_buf), out_buf.length());
|
||||||
|
out_buf.clear();
|
||||||
|
}
|
||||||
};
|
};
|
||||||
} // namespace utils
|
} // namespace utils
|
||||||
} // namespace rabit
|
} // namespace rabit
|
||||||
#endif // RABIT_LEARN_UTILS_BASE64_H_
|
#endif // RABIT_LEARN_UTILS_BASE64_INL_H_
|
||||||
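For orientation, a hedged sketch of a round trip through the reworked stream-based base64 classes above. The include path, the `file://` URI, and the use of `io::CreateStream` to obtain the underlying `IStream` are assumptions for illustration.

```cpp
// Illustrative sketch only: round-trip a small payload through the new
// stream-based Base64OutStream / Base64InStream.  The include path, file://
// URI and CreateStream helper are assumptions for illustration.
#include <cstdio>
#include <cstring>
#include "io.h"  // assumed: the rabit-learn/io umbrella header

int main() {
  const char msg[] = "hello rabit";
  {  // encode
    rabit::IStream *fo = rabit::io::CreateStream("file://demo.b64", "w");
    rabit::io::Base64OutStream bout(fo);
    bout.Write(msg, sizeof(msg));
    bout.Finish('\n');  // flush the tail bytes and terminate the record
    delete fo;
  }
  {  // decode
    rabit::IStream *fi = rabit::io::CreateStream("file://demo.b64", "r");
    rabit::io::Base64InStream bin(fi);
    bin.InitPosition();  // skip leading whitespace before decoding
    char out[sizeof(msg)] = {0};
    bin.Read(out, sizeof(msg));
    std::printf("decoded: %s\n", out);
    delete fi;
  }
  return 0;
}
```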
subtree/rabit/rabit-learn/io/buffer_reader-inl.h (new file, 57 lines)
@ -0,0 +1,57 @@
|
|||||||
|
#ifndef RABIT_LEARN_IO_BUFFER_READER_INL_H_
|
||||||
|
#define RABIT_LEARN_IO_BUFFER_READER_INL_H_
|
||||||
|
/*!
|
||||||
|
* \file buffer_reader-inl.h
|
||||||
|
* \brief implementation of stream buffer reader
|
||||||
|
* \author Tianqi Chen
|
||||||
|
*/
|
||||||
|
#include "./io.h"
|
||||||
|
|
||||||
|
namespace rabit {
|
||||||
|
namespace io {
|
||||||
|
/*! \brief buffer reader of the stream that allows you to get */
|
||||||
|
class StreamBufferReader {
|
||||||
|
public:
|
||||||
|
StreamBufferReader(size_t buffer_size)
|
||||||
|
:stream_(NULL),
|
||||||
|
read_len_(1), read_ptr_(1) {
|
||||||
|
buffer_.resize(buffer_size);
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief set input stream
|
||||||
|
*/
|
||||||
|
inline void set_stream(IStream *stream) {
|
||||||
|
stream_ = stream;
|
||||||
|
read_len_ = read_ptr_ = 1;
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief allows quick read using get char
|
||||||
|
*/
|
||||||
|
inline char GetChar(void) {
|
||||||
|
while (true) {
|
||||||
|
if (read_ptr_ < read_len_) {
|
||||||
|
return buffer_[read_ptr_++];
|
||||||
|
} else {
|
||||||
|
read_len_ = stream_->Read(&buffer_[0], buffer_.length());
|
||||||
|
if (read_len_ == 0) return EOF;
|
||||||
|
read_ptr_ = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
inline bool AtEnd(void) const {
|
||||||
|
return read_len_ == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
/*! \brief the underlying stream */
|
||||||
|
IStream *stream_;
|
||||||
|
/*! \brief buffer to hold data */
|
||||||
|
std::string buffer_;
|
||||||
|
/*! \brief length of valid data in buffer */
|
||||||
|
size_t read_len_;
|
||||||
|
/*! \brief pointer in the buffer */
|
||||||
|
size_t read_ptr_;
|
||||||
|
};
|
||||||
|
} // namespace io
|
||||||
|
} // namespace rabit
|
||||||
|
#endif // RABIT_LEARN_IO_BUFFER_READER_INL_H_
|
||||||
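A short illustrative sketch of driving the new `StreamBufferReader`; the `MemoryFixSizeBuffer` source stream and the include paths are assumptions, while `set_stream`, `GetChar`, and `AtEnd` mirror the class above.

```cpp
// Illustrative sketch only: character-by-character reading through the new
// StreamBufferReader.  Source stream and include paths are assumptions.
#include <cstdio>
#include <cstring>
#include "buffer_reader-inl.h"   // assumed include path
#include "rabit_serializable.h"  // assumed header for MemoryFixSizeBuffer

int main() {
  char text[] = "line1\nline2\n";
  rabit::utils::MemoryFixSizeBuffer src(text, std::strlen(text));
  rabit::io::StreamBufferReader reader(256);  // 256-byte internal buffer
  reader.set_stream(&src);
  // Pull characters until the underlying stream is exhausted.
  while (true) {
    char c = reader.GetChar();
    if (reader.AtEnd()) break;
    std::putchar(c);
  }
  return 0;
}
```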
subtree/rabit/rabit-learn/io/file-inl.h (new file, 106 lines)
@ -0,0 +1,106 @@
|
|||||||
|
#ifndef RABIT_LEARN_IO_FILE_INL_H_
|
||||||
|
#define RABIT_LEARN_IO_FILE_INL_H_
|
||||||
|
/*!
|
||||||
|
* \file file-inl.h
|
||||||
|
* \brief normal filesystem I/O
|
||||||
|
* \author Tianqi Chen
|
||||||
|
*/
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <cstdio>
|
||||||
|
#include "./io.h"
|
||||||
|
#include "./line_split-inl.h"
|
||||||
|
|
||||||
|
/*! \brief io interface */
|
||||||
|
namespace rabit {
|
||||||
|
namespace io {
|
||||||
|
/*! \brief implementation of file i/o stream */
|
||||||
|
class FileStream : public utils::ISeekStream {
|
||||||
|
public:
|
||||||
|
explicit FileStream(const char *fname, const char *mode)
|
||||||
|
: use_stdio(false) {
|
||||||
|
#ifndef RABIT_STRICT_CXX98_
|
||||||
|
if (!strcmp(fname, "stdin")) {
|
||||||
|
use_stdio = true; fp = stdin;
|
||||||
|
}
|
||||||
|
if (!strcmp(fname, "stdout")) {
|
||||||
|
use_stdio = true; fp = stdout;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (!strncmp(fname, "file://", 7)) fname += 7;
|
||||||
|
if (!use_stdio) {
|
||||||
|
std::string flag = mode;
|
||||||
|
if (flag == "w") flag = "wb";
|
||||||
|
if (flag == "r") flag = "rb";
|
||||||
|
fp = utils::FopenCheck(fname, flag.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
virtual ~FileStream(void) {
|
||||||
|
this->Close();
|
||||||
|
}
|
||||||
|
virtual size_t Read(void *ptr, size_t size) {
|
||||||
|
return std::fread(ptr, 1, size, fp);
|
||||||
|
}
|
||||||
|
virtual void Write(const void *ptr, size_t size) {
|
||||||
|
std::fwrite(ptr, size, 1, fp);
|
||||||
|
}
|
||||||
|
virtual void Seek(size_t pos) {
|
||||||
|
std::fseek(fp, static_cast<long>(pos), SEEK_SET);
|
||||||
|
}
|
||||||
|
virtual size_t Tell(void) {
|
||||||
|
return std::ftell(fp);
|
||||||
|
}
|
||||||
|
virtual bool AtEnd(void) const {
|
||||||
|
return feof(fp) != 0;
|
||||||
|
}
|
||||||
|
inline void Close(void) {
|
||||||
|
if (fp != NULL && !use_stdio) {
|
||||||
|
std::fclose(fp); fp = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
FILE *fp;
|
||||||
|
bool use_stdio;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief line split from normal file system */
|
||||||
|
class FileSplit : public LineSplitBase {
|
||||||
|
public:
|
||||||
|
explicit FileSplit(const char *uri, unsigned rank, unsigned nsplit) {
|
||||||
|
LineSplitBase::SplitNames(&fnames_, uri, "#");
|
||||||
|
std::vector<size_t> fsize;
|
||||||
|
for (size_t i = 0; i < fnames_.size(); ++i) {
|
||||||
|
if (!strncmp(fnames_[i].c_str(), "file://", 7)) {
|
||||||
|
std::string tmp = fnames_[i].c_str() + 7;
|
||||||
|
fnames_[i] = tmp;
|
||||||
|
}
|
||||||
|
fsize.push_back(GetFileSize(fnames_[i].c_str()));
|
||||||
|
}
|
||||||
|
LineSplitBase::Init(fsize, rank, nsplit);
|
||||||
|
}
|
||||||
|
virtual ~FileSplit(void) {}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
virtual utils::ISeekStream *GetFile(size_t file_index) {
|
||||||
|
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
|
||||||
|
return new FileStream(fnames_[file_index].c_str(), "rb");
|
||||||
|
}
|
||||||
|
// get file size
|
||||||
|
inline static size_t GetFileSize(const char *fname) {
|
||||||
|
FILE *fp = utils::FopenCheck(fname, "rb");
|
||||||
|
// NOTE: fseek may not be good, but serves as ok solution
|
||||||
|
fseek(fp, 0, SEEK_END);
|
||||||
|
size_t fsize = static_cast<size_t>(ftell(fp));
|
||||||
|
fclose(fp);
|
||||||
|
return fsize;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
// file names
|
||||||
|
std::vector<std::string> fnames_;
|
||||||
|
};
|
||||||
|
} // namespace io
|
||||||
|
} // namespace rabit
|
||||||
|
#endif // RABIT_LEARN_IO_FILE_INL_H_
|
||||||
|
|
||||||
subtree/rabit/rabit-learn/io/hdfs-inl.h (new file, 140 lines)
@ -0,0 +1,140 @@
|
|||||||
|
#ifndef RABIT_LEARN_IO_HDFS_INL_H_
|
||||||
|
#define RABIT_LEARN_IO_HDFS_INL_H_
|
||||||
|
/*!
|
||||||
|
* \file hdfs-inl.h
|
||||||
|
* \brief HDFS I/O
|
||||||
|
* \author Tianqi Chen
|
||||||
|
*/
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <hdfs.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include "./io.h"
|
||||||
|
#include "./line_split-inl.h"
|
||||||
|
|
||||||
|
/*! \brief io interface */
|
||||||
|
namespace rabit {
|
||||||
|
namespace io {
|
||||||
|
class HDFSStream : public utils::ISeekStream {
|
||||||
|
public:
|
||||||
|
HDFSStream(hdfsFS fs, const char *fname, const char *mode)
|
||||||
|
: fs_(fs), at_end_(false) {
|
||||||
|
int flag;
|
||||||
|
if (!strcmp(mode, "r")) {
|
||||||
|
flag = O_RDONLY;
|
||||||
|
} else if (!strcmp(mode, "w")) {
|
||||||
|
flag = O_WRONLY;
|
||||||
|
} else if (!strcmp(mode, "a")) {
|
||||||
|
flag = O_WRONLY | O_APPEND;
|
||||||
|
} else {
|
||||||
|
utils::Error("HDFSStream: unknown flag %s", mode);
|
||||||
|
}
|
||||||
|
fp_ = hdfsOpenFile(fs_, fname, flag, 0, 0, 0);
|
||||||
|
utils::Check(fp_ != NULL,
|
||||||
|
"HDFSStream: fail to open %s", fname);
|
||||||
|
}
|
||||||
|
virtual ~HDFSStream(void) {
|
||||||
|
this->Close();
|
||||||
|
}
|
||||||
|
virtual size_t Read(void *ptr, size_t size) {
|
||||||
|
tSize nread = hdfsRead(fs_, fp_, ptr, size);
|
||||||
|
if (nread == -1) {
|
||||||
|
int errsv = errno;
|
||||||
|
utils::Error("HDFSStream.Read Error:%s", strerror(errsv));
|
||||||
|
}
|
||||||
|
if (nread == 0) {
|
||||||
|
at_end_ = true;
|
||||||
|
}
|
||||||
|
return static_cast<size_t>(nread);
|
||||||
|
}
|
||||||
|
virtual void Write(const void *ptr, size_t size) {
|
||||||
|
const char *buf = reinterpret_cast<const char*>(ptr);
|
||||||
|
while (size != 0) {
|
||||||
|
tSize nwrite = hdfsWrite(fs_, fp_, buf, size);
|
||||||
|
if (nwrite == -1) {
|
||||||
|
int errsv = errno;
|
||||||
|
utils::Error("HDFSStream.Write Error:%s", strerror(errsv));
|
||||||
|
}
|
||||||
|
size_t sz = static_cast<size_t>(nwrite);
|
||||||
|
buf += sz; size -= sz;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
virtual void Seek(size_t pos) {
|
||||||
|
if (hdfsSeek(fs_, fp_, pos) != 0) {
|
||||||
|
int errsv = errno;
|
||||||
|
utils::Error("HDFSStream.Seek Error:%s", strerror(errsv));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
virtual size_t Tell(void) {
|
||||||
|
tOffset offset = hdfsTell(fs_, fp_);
|
||||||
|
if (offset == -1) {
|
||||||
|
int errsv = errno;
|
||||||
|
utils::Error("HDFSStream.Tell Error:%s", strerror(errsv));
|
||||||
|
}
|
||||||
|
return static_cast<size_t>(offset);
|
||||||
|
}
|
||||||
|
virtual bool AtEnd(void) const {
|
||||||
|
return at_end_;
|
||||||
|
}
|
||||||
|
inline void Close(void) {
|
||||||
|
if (fp_ != NULL) {
|
||||||
|
if (hdfsCloseFile(fs_, fp_) == -1) {
|
||||||
|
int errsv = errno;
|
||||||
|
utils::Error("HDFSStream.Close Error:%s", strerror(errsv));
|
||||||
|
}
|
||||||
|
fp_ = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
hdfsFS fs_;
|
||||||
|
hdfsFile fp_;
|
||||||
|
bool at_end_;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief line split from normal file system */
|
||||||
|
class HDFSSplit : public LineSplitBase {
|
||||||
|
public:
|
||||||
|
explicit HDFSSplit(const char *uri, unsigned rank, unsigned nsplit) {
|
||||||
|
fs_ = hdfsConnect("default", 0);
|
||||||
|
std::vector<std::string> paths;
|
||||||
|
LineSplitBase::SplitNames(&paths, uri, "#");
|
||||||
|
// get the files
|
||||||
|
std::vector<size_t> fsize;
|
||||||
|
for (size_t i = 0; i < paths.size(); ++i) {
|
||||||
|
hdfsFileInfo *info = hdfsGetPathInfo(fs_, paths[i].c_str());
|
||||||
|
if (info->mKind == 'D') {
|
||||||
|
int nentry;
|
||||||
|
hdfsFileInfo *files = hdfsListDirectory(fs_, info->mName, &nentry);
|
||||||
|
for (int i = 0; i < nentry; ++i) {
|
||||||
|
if (files[i].mKind == 'F') {
|
||||||
|
fsize.push_back(files[i].mSize);
|
||||||
|
fnames_.push_back(std::string(files[i].mName));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
hdfsFreeFileInfo(files, nentry);
|
||||||
|
} else {
|
||||||
|
fsize.push_back(info->mSize);
|
||||||
|
fnames_.push_back(std::string(info->mName));
|
||||||
|
}
|
||||||
|
hdfsFreeFileInfo(info, 1);
|
||||||
|
}
|
||||||
|
LineSplitBase::Init(fsize, rank, nsplit);
|
||||||
|
}
|
||||||
|
virtual ~HDFSSplit(void) {}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
virtual utils::ISeekStream *GetFile(size_t file_index) {
|
||||||
|
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
|
||||||
|
return new HDFSStream(fs_, fnames_[file_index].c_str(), "r");
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
// hdfs handle
|
||||||
|
hdfsFS fs_;
|
||||||
|
// file names
|
||||||
|
std::vector<std::string> fnames_;
|
||||||
|
};
|
||||||
|
} // namespace io
|
||||||
|
} // namespace rabit
|
||||||
|
#endif // RABIT_LEARN_IO_HDFS_INL_H_
|
||||||
subtree/rabit/rabit-learn/io/io-inl.h (new file, 65 lines)
@ -0,0 +1,65 @@
|
|||||||
|
#ifndef RABIT_LEARN_IO_IO_INL_H_
|
||||||
|
#define RABIT_LEARN_IO_IO_INL_H_
|
||||||
|
/*!
|
||||||
|
* \file io-inl.h
|
||||||
|
* \brief Input/Output utils that handles read/write
|
||||||
|
* of files in distrubuted enviroment
|
||||||
|
* \author Tianqi Chen
|
||||||
|
*/
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#include "./io.h"
|
||||||
|
#if RABIT_USE_HDFS
|
||||||
|
#include "./hdfs-inl.h"
|
||||||
|
#endif
|
||||||
|
#include "./file-inl.h"
|
||||||
|
|
||||||
|
namespace rabit {
|
||||||
|
namespace io {
|
||||||
|
/*!
|
||||||
|
* \brief create input split given a uri
|
||||||
|
* \param uri the uri of the input, can contain hdfs prefix
|
||||||
|
* \param part the part id of current input
|
||||||
|
* \param nsplit total number of splits
|
||||||
|
*/
|
||||||
|
inline InputSplit *CreateInputSplit(const char *uri,
|
||||||
|
unsigned part,
|
||||||
|
unsigned nsplit) {
|
||||||
|
if (!strcmp(uri, "stdin")) {
|
||||||
|
return new SingleFileSplit(uri);
|
||||||
|
}
|
||||||
|
if (!strncmp(uri, "file://", 7)) {
|
||||||
|
return new FileSplit(uri, part, nsplit);
|
||||||
|
}
|
||||||
|
if (!strncmp(uri, "hdfs://", 7)) {
|
||||||
|
#if RABIT_USE_HDFS
|
||||||
|
return new HDFSSplit(uri, part, nsplit);
|
||||||
|
#else
|
||||||
|
utils::Error("Please compile with RABIT_USE_HDFS=1");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
return new FileSplit(uri, part, nsplit);
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief create an stream, the stream must be able to close
|
||||||
|
* the underlying resources(files) when deleted
|
||||||
|
*
|
||||||
|
* \param uri the uri of the input, can contain hdfs prefix
|
||||||
|
* \param mode can be 'w' or 'r' for read or write
|
||||||
|
*/
|
||||||
|
inline IStream *CreateStream(const char *uri, const char *mode) {
|
||||||
|
if (!strncmp(uri, "file://", 7)) {
|
||||||
|
return new FileStream(uri + 7, mode);
|
||||||
|
}
|
||||||
|
if (!strncmp(uri, "hdfs://", 7)) {
|
||||||
|
#if RABIT_USE_HDFS
|
||||||
|
return new HDFSStream(hdfsConnect("default", 0), uri, mode);
|
||||||
|
#else
|
||||||
|
utils::Error("Please compile with RABIT_USE_HDFS=1");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
return new FileStream(uri, mode);
|
||||||
|
}
|
||||||
|
} // namespace io
|
||||||
|
} // namespace rabit
|
||||||
|
#endif // RABIT_LEARN_IO_IO_INL_H_
|
||||||
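To make the intended call pattern concrete, a hedged sketch of a worker that reads its share of the input with `CreateInputSplit` and writes a result with `CreateStream`. The URIs are placeholders, the include path is an assumption, and `rabit::Init`/`Finalize` come from the core rabit API.

```cpp
// Illustrative sketch only: each worker reads its own slice of the input and
// rank 0 writes a small result stream through the new rabit-learn io helpers.
#include <cstdio>
#include <string>
#include <rabit.h>
#include "io.h"  // assumed: rabit-learn/io/io.h

int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  rabit::io::InputSplit *in = rabit::io::CreateInputSplit(
      "file://demo.txt", rabit::GetRank(), rabit::GetWorldSize());
  size_t nline = 0;
  std::string line;
  while (in->NextLine(&line)) ++nline;  // count lines in this worker's share
  delete in;
  std::printf("worker %d saw %lu lines\n",
              rabit::GetRank(), static_cast<unsigned long>(nline));
  if (rabit::GetRank() == 0) {
    // "hdfs://..." URIs also work when compiled with RABIT_USE_HDFS=1.
    rabit::IStream *fo = rabit::io::CreateStream("file://result.bin", "w");
    fo->Write(&nline, sizeof(nline));
    delete fo;
  }
  rabit::Finalize();
  return 0;
}
```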
subtree/rabit/rabit-learn/io/io.h (new file, 61 lines)
@ -0,0 +1,61 @@
|
|||||||
|
#ifndef RABIT_LEARN_IO_IO_H_
|
||||||
|
#define RABIT_LEARN_IO_IO_H_
|
||||||
|
/*!
|
||||||
|
* \file io.h
|
||||||
|
* \brief Input/Output utils that handles read/write
|
||||||
|
* of files in distrubuted enviroment
|
||||||
|
* \author Tianqi Chen
|
||||||
|
*/
|
||||||
|
#include "../../include/rabit_serializable.h"
|
||||||
|
|
||||||
|
/*! \brief whether compile with HDFS support */
|
||||||
|
#ifndef RABIT_USE_HDFS
|
||||||
|
#define RABIT_USE_HDFS 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*! \brief io interface */
|
||||||
|
namespace rabit {
|
||||||
|
/*!
|
||||||
|
* \brief namespace to handle input split and filesystem interfacing
|
||||||
|
*/
|
||||||
|
namespace io {
|
||||||
|
typedef utils::ISeekStream ISeekStream;
|
||||||
|
/*!
|
||||||
|
* \brief user facing input split helper,
|
||||||
|
* can be used to get the partition of data used by current node
|
||||||
|
*/
|
||||||
|
class InputSplit {
|
||||||
|
public:
|
||||||
|
/*!
|
||||||
|
* \brief get next line, store into out_data
|
||||||
|
* \param out_data the string that stores the line data,
|
||||||
|
* \n is not included
|
||||||
|
* \return true of next line was found, false if we read all the lines
|
||||||
|
*/
|
||||||
|
virtual bool NextLine(std::string *out_data) = 0;
|
||||||
|
/*! \brief destructor*/
|
||||||
|
virtual ~InputSplit(void) {}
|
||||||
|
};
|
||||||
|
/*!
|
||||||
|
* \brief create input split given a uri
|
||||||
|
* \param uri the uri of the input, can contain hdfs prefix
|
||||||
|
* \param part the part id of current input
|
||||||
|
* \param nsplit total number of splits
|
||||||
|
*/
|
||||||
|
inline InputSplit *CreateInputSplit(const char *uri,
|
||||||
|
unsigned part,
|
||||||
|
unsigned nsplit);
|
||||||
|
/*!
|
||||||
|
* \brief create an stream, the stream must be able to close
|
||||||
|
* the underlying resources(files) when deleted
|
||||||
|
*
|
||||||
|
* \param uri the uri of the input, can contain hdfs prefix
|
||||||
|
* \param mode can be 'w' or 'r' for read or write
|
||||||
|
*/
|
||||||
|
inline IStream *CreateStream(const char *uri, const char *mode);
|
||||||
|
} // namespace io
|
||||||
|
} // namespace rabit
|
||||||
|
|
||||||
|
#include "./io-inl.h"
|
||||||
|
#include "./base64-inl.h"
|
||||||
|
#endif // RABIT_LEARN_IO_IO_H_
|
||||||
subtree/rabit/rabit-learn/io/line_split-inl.h (new file, 181 lines)
@ -0,0 +1,181 @@
|
|||||||
|
#ifndef RABIT_LEARN_IO_LINE_SPLIT_INL_H_
|
||||||
|
#define RABIT_LEARN_IO_LINE_SPLIT_INL_H_
|
||||||
|
/*!
|
||||||
|
* \file line_split-inl.h
|
||||||
|
* \brief base implementation of line-spliter
|
||||||
|
* \author Tianqi Chen
|
||||||
|
*/
|
||||||
|
#include <vector>
|
||||||
|
#include <utility>
|
||||||
|
#include <cstring>
|
||||||
|
#include <string>
|
||||||
|
#include "../../include/rabit.h"
|
||||||
|
#include "./io.h"
|
||||||
|
#include "./buffer_reader-inl.h"
|
||||||
|
|
||||||
|
namespace rabit {
|
||||||
|
namespace io {
|
||||||
|
class LineSplitBase : public InputSplit {
|
||||||
|
public:
|
||||||
|
virtual ~LineSplitBase() {
|
||||||
|
if (fs_ != NULL) delete fs_;
|
||||||
|
}
|
||||||
|
virtual bool NextLine(std::string *out_data) {
|
||||||
|
if (file_ptr_ >= file_ptr_end_ &&
|
||||||
|
offset_curr_ >= offset_end_) return false;
|
||||||
|
out_data->clear();
|
||||||
|
while (true) {
|
||||||
|
char c = reader_.GetChar();
|
||||||
|
if (reader_.AtEnd()) {
|
||||||
|
if (out_data->length() != 0) return true;
|
||||||
|
file_ptr_ += 1;
|
||||||
|
if (offset_curr_ != file_offset_[file_ptr_]) {
|
||||||
|
utils::Error("warning:file size not calculated correctly\n");
|
||||||
|
offset_curr_ = file_offset_[file_ptr_];
|
||||||
|
}
|
||||||
|
if (offset_curr_ >= offset_end_) return false;
|
||||||
|
utils::Assert(file_ptr_ + 1 < file_offset_.size(),
|
||||||
|
"boundary check");
|
||||||
|
delete fs_;
|
||||||
|
fs_ = this->GetFile(file_ptr_);
|
||||||
|
reader_.set_stream(fs_);
|
||||||
|
} else {
|
||||||
|
++offset_curr_;
|
||||||
|
if (c != '\r' && c != '\n' && c != EOF) {
|
||||||
|
*out_data += c;
|
||||||
|
} else {
|
||||||
|
if (out_data->length() != 0) return true;
|
||||||
|
if (file_ptr_ >= file_ptr_end_ &&
|
||||||
|
offset_curr_ >= offset_end_) return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
// constructor
|
||||||
|
LineSplitBase(void)
|
||||||
|
: fs_(NULL), reader_(kBufferSize) {
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief initialize the line spliter,
|
||||||
|
* \param file_size, size of each files
|
||||||
|
* \param rank the current rank of the data
|
||||||
|
* \param nsplit number of split we will divide the data into
|
||||||
|
*/
|
||||||
|
inline void Init(const std::vector<size_t> &file_size,
|
||||||
|
unsigned rank, unsigned nsplit) {
|
||||||
|
file_offset_.resize(file_size.size() + 1);
|
||||||
|
file_offset_[0] = 0;
|
||||||
|
for (size_t i = 0; i < file_size.size(); ++i) {
|
||||||
|
file_offset_[i + 1] = file_offset_[i] + file_size[i];
|
||||||
|
}
|
||||||
|
size_t ntotal = file_offset_.back();
|
||||||
|
size_t nstep = (ntotal + nsplit - 1) / nsplit;
|
||||||
|
offset_begin_ = std::min(nstep * rank, ntotal);
|
||||||
|
offset_end_ = std::min(nstep * (rank + 1), ntotal);
|
||||||
|
offset_curr_ = offset_begin_;
|
||||||
|
if (offset_begin_ == offset_end_) return;
|
||||||
|
file_ptr_ = std::upper_bound(file_offset_.begin(),
|
||||||
|
file_offset_.end(),
|
||||||
|
offset_begin_) - file_offset_.begin() - 1;
|
||||||
|
file_ptr_end_ = std::upper_bound(file_offset_.begin(),
|
||||||
|
file_offset_.end(),
|
||||||
|
offset_end_) - file_offset_.begin() - 1;
|
||||||
|
fs_ = GetFile(file_ptr_);
|
||||||
|
reader_.set_stream(fs_);
|
||||||
|
// try to set the starting position correctly
|
||||||
|
if (file_offset_[file_ptr_] != offset_begin_) {
|
||||||
|
fs_->Seek(offset_begin_ - file_offset_[file_ptr_]);
|
||||||
|
while (true) {
|
||||||
|
char c = reader_.GetChar();
|
||||||
|
if (!reader_.AtEnd()) ++offset_curr_;
|
||||||
|
if (c == '\n' || c == '\r' || c == EOF) return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*!
|
||||||
|
* \brief get the seek stream of given file_index
|
||||||
|
* \return the corresponding seek stream at head of file
|
||||||
|
*/
|
||||||
|
virtual utils::ISeekStream *GetFile(size_t file_index) = 0;
|
||||||
|
/*!
|
||||||
|
* \brief split names given
|
||||||
|
* \param out_fname output file names
|
||||||
|
* \param uri_ the iput uri file
|
||||||
|
* \param dlm deliminetr
|
||||||
|
*/
|
||||||
|
inline static void SplitNames(std::vector<std::string> *out_fname,
|
||||||
|
const char *uri_,
|
||||||
|
const char *dlm) {
|
||||||
|
std::string uri = uri_;
|
||||||
|
char *p = strtok(BeginPtr(uri), dlm);
|
||||||
|
while (p != NULL) {
|
||||||
|
out_fname->push_back(std::string(p));
|
||||||
|
p = strtok(NULL, dlm);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
/*! \brief current input stream */
|
||||||
|
utils::ISeekStream *fs_;
|
||||||
|
/*! \brief file pointer of which file to read on */
|
||||||
|
size_t file_ptr_;
|
||||||
|
/*! \brief file pointer where the end of file lies */
|
||||||
|
size_t file_ptr_end_;
|
||||||
|
/*! \brief get the current offset */
|
||||||
|
size_t offset_curr_;
|
||||||
|
/*! \brief beginning of offset */
|
||||||
|
size_t offset_begin_;
|
||||||
|
/*! \brief end of the offset */
|
||||||
|
size_t offset_end_;
|
||||||
|
/*! \brief byte-offset of each file */
|
||||||
|
std::vector<size_t> file_offset_;
|
||||||
|
/*! \brief buffer reader */
|
||||||
|
StreamBufferReader reader_;
|
||||||
|
/*! \brief buffer size */
|
||||||
|
const static size_t kBufferSize = 256;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*! \brief line split from single file */
|
||||||
|
class SingleFileSplit : public InputSplit {
|
||||||
|
public:
|
||||||
|
explicit SingleFileSplit(const char *fname) {
|
||||||
|
if (!strcmp(fname, "stdin")) {
|
||||||
|
#ifndef RABIT_STRICT_CXX98_
|
||||||
|
use_stdin_ = true; fp_ = stdin;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (!use_stdin_) {
|
||||||
|
fp_ = utils::FopenCheck(fname, "r");
|
||||||
|
}
|
||||||
|
end_of_file_ = false;
|
||||||
|
}
|
||||||
|
virtual ~SingleFileSplit(void) {
|
||||||
|
if (!use_stdin_) fclose(fp_);
|
||||||
|
}
|
||||||
|
virtual bool NextLine(std::string *out_data) {
|
||||||
|
if (end_of_file_) return false;
|
||||||
|
out_data->clear();
|
||||||
|
while (true) {
|
||||||
|
char c = fgetc(fp_);
|
||||||
|
if (c == EOF) {
|
||||||
|
end_of_file_ = true;
|
||||||
|
}
|
||||||
|
if (c != '\r' && c != '\n' && c != EOF) {
|
||||||
|
*out_data += c;
|
||||||
|
} else {
|
||||||
|
if (out_data->length() != 0) return true;
|
||||||
|
if (end_of_file_) return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
FILE *fp_;
|
||||||
|
bool use_stdin_;
|
||||||
|
bool end_of_file_;
|
||||||
|
};
|
||||||
|
} // namespace io
|
||||||
|
} // namespace rabit
|
||||||
|
#endif // RABIT_LEARN_IO_LINE_SPLIT_INL_H_
|
||||||
@ -6,11 +6,10 @@ MPIBIN = kmeans.mpi
 OBJ = kmeans.o

 # common build script for programs
-include ../common.mk
+include ../make/common.mk

 # dependenies here
 kmeans.rabit: kmeans.o lib
 kmeans.mock: kmeans.o lib
 kmeans.mpi: kmeans.o libmpi
 kmeans.o: kmeans.cc ../../src/*.h
subtree/rabit/rabit-learn/linear/.gitignore (new vendored file, 2 lines)
@ -0,0 +1,2 @@
+mushroom.row*
+*.model
@ -6,7 +6,8 @@ MPIBIN =
 OBJ = linear.o

 # common build script for programs
-include ../common.mk
+include ../make/config.mk
+include ../make/common.mk
 CFLAGS+=-fopenmp
 linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
 # dependenies here
@ -2,11 +2,24 @@ Linear and Logistic Regression
 ====
 * input format: LibSVM
 * Local Example: [run-linear.sh](run-linear.sh)
-* Running on Hadoop: [run-hadoop.sh](run-hadoop.sh)
-  - Set input data to stdin, and model_out=stdout
+* Running on YARN: [run-yarn.sh](run-yarn.sh)
+  - You will need to have YARN
+  - Modify ```../make/config.mk``` to set USE_HDFS=1 to compile with HDFS support
+  - Run build.sh in [../../yarn](../../yarn) to build the yarn jar file
+
+Multi-Threading Optimization
+====
+* The code can be multi-threaded, we encourage you to use it
+  - Simply add ```nthread=k``` where k is the number of threads you want to use
+* If you submit with YARN
+  - Use ```--vcores``` and ```-mem``` to request CPU and memory resources
+  - Some schedulers in YARN do not honor the CPU request, so you can request more memory to grab working slots
+* Usually multi-threading improves speed in general
+  - You can use fewer workers and assign more resources to each worker
+  - This usually means less communication overhead and faster running time
+
 Parameters
-===
+====
 All the parameters can be set by param=value

 #### Important Parameters
@ -1,6 +1,5 @@
|
|||||||
#include "./linear.h"
|
#include "./linear.h"
|
||||||
#include "../utils/io.h"
|
#include "../io/io.h"
|
||||||
#include "../utils/base64.h"
|
|
||||||
|
|
||||||
namespace rabit {
|
namespace rabit {
|
||||||
namespace linear {
|
namespace linear {
|
||||||
@ -55,7 +54,9 @@ class LinearObjFunction : public solver::IObjFunction<float> {
|
|||||||
}
|
}
|
||||||
if (task == "train") {
|
if (task == "train") {
|
||||||
lbfgs.Run();
|
lbfgs.Run();
|
||||||
this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
|
if (rabit::GetRank() == 0) {
|
||||||
|
this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
|
||||||
|
}
|
||||||
} else if (task == "pred") {
|
} else if (task == "pred") {
|
||||||
this->TaskPred();
|
this->TaskPred();
|
||||||
} else {
|
} else {
|
||||||
@ -74,51 +75,37 @@ class LinearObjFunction : public solver::IObjFunction<float> {
|
|||||||
printf("Finishing writing to %s\n", name_pred.c_str());
|
printf("Finishing writing to %s\n", name_pred.c_str());
|
||||||
}
|
}
|
||||||
inline void LoadModel(const char *fname) {
|
inline void LoadModel(const char *fname) {
|
||||||
FILE *fp = utils::FopenCheck(fname, "rb");
|
IStream *fi = io::CreateStream(fname, "r");
|
||||||
std::string header; header.resize(4);
|
std::string header; header.resize(4);
|
||||||
// check header for different binary encode
|
// check header for different binary encode
|
||||||
// can be base64 or binary
|
// can be base64 or binary
|
||||||
utils::FileStream fi(fp);
|
utils::Check(fi->Read(&header[0], 4) != 0, "invalid model");
|
||||||
utils::Check(fi.Read(&header[0], 4) != 0, "invalid model");
|
// base64 format
|
||||||
// base64 format
|
|
||||||
if (header == "bs64") {
|
if (header == "bs64") {
|
||||||
utils::Base64InStream bsin(fp);
|
io::Base64InStream bsin(fi);
|
||||||
bsin.InitPosition();
|
bsin.InitPosition();
|
||||||
model.Load(bsin);
|
model.Load(bsin);
|
||||||
fclose(fp);
|
|
||||||
return;
|
|
||||||
} else if (header == "binf") {
|
} else if (header == "binf") {
|
||||||
model.Load(fi);
|
model.Load(*fi);
|
||||||
fclose(fp);
|
|
||||||
return;
|
|
||||||
} else {
|
} else {
|
||||||
utils::Error("invalid model file");
|
utils::Error("invalid model file");
|
||||||
}
|
}
|
||||||
|
delete fi;
|
||||||
}
|
}
|
||||||
inline void SaveModel(const char *fname,
|
inline void SaveModel(const char *fname,
|
||||||
const float *wptr,
|
const float *wptr,
|
||||||
bool save_base64 = false) {
|
bool save_base64 = false) {
|
||||||
FILE *fp;
|
IStream *fo = io::CreateStream(fname, "w");
|
||||||
bool use_stdout = false;
|
if (save_base64 != 0 || !strcmp(fname, "stdout")) {
|
||||||
if (!strcmp(fname, "stdout")) {
|
fo->Write("bs64\t", 5);
|
||||||
fp = stdout;
|
io::Base64OutStream bout(fo);
|
||||||
use_stdout = true;
|
|
||||||
} else {
|
|
||||||
fp = utils::FopenCheck(fname, "wb");
|
|
||||||
}
|
|
||||||
utils::FileStream fo(fp);
|
|
||||||
if (save_base64 != 0|| use_stdout) {
|
|
||||||
fo.Write("bs64\t", 5);
|
|
||||||
utils::Base64OutStream bout(fp);
|
|
||||||
model.Save(bout, wptr);
|
model.Save(bout, wptr);
|
||||||
bout.Finish('\n');
|
bout.Finish('\n');
|
||||||
} else {
|
} else {
|
||||||
fo.Write("binf", 4);
|
fo->Write("binf", 4);
|
||||||
model.Save(fo, wptr);
|
model.Save(*fo, wptr);
|
||||||
}
|
|
||||||
if (!use_stdout) {
|
|
||||||
fclose(fp);
|
|
||||||
}
|
}
|
||||||
|
delete fo;
|
||||||
}
|
}
|
||||||
inline void LoadData(const char *fname) {
|
inline void LoadData(const char *fname) {
|
||||||
dtrain.Load(fname);
|
dtrain.Load(fname);
|
||||||
|
|||||||
@ -12,7 +12,7 @@ hadoop fs -mkdir $2/data
|
|||||||
hadoop fs -put ../data/agaricus.txt.train $2/data
|
hadoop fs -put ../data/agaricus.txt.train $2/data
|
||||||
|
|
||||||
# submit to hadoop
|
# submit to hadoop
|
||||||
../../tracker/rabit_hadoop.py --host_ip ip -n $1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
|
../../tracker/rabit_hadoop_streaming.py -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
|
||||||
|
|
||||||
# get the final model file
|
# get the final model file
|
||||||
hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model
|
hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model
|
||||||
@ -5,11 +5,7 @@ then
|
|||||||
exit -1
|
exit -1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -rf mushroom.row* *.model
|
rm -rf *.model
|
||||||
k=$1
|
k=$1
|
||||||
|
|
||||||
# split the lib svm file into k subfiles
|
../../tracker/rabit_demo.py -n $k linear.mock ../data/agaricus.txt.train "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1
|
||||||
python splitrows.py ../data/agaricus.txt.train mushroom $k
|
|
||||||
|
|
||||||
# run xgboost mpi
|
|
||||||
../../tracker/rabit_demo.py -n $k linear.mock mushroom.row\%d "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1
|
|
||||||
|
|||||||
@ -5,13 +5,10 @@ then
|
|||||||
exit -1
|
exit -1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -rf mushroom.row* *.model
|
rm -rf *.model
|
||||||
k=$1
|
k=$1
|
||||||
|
|
||||||
# split the lib svm file into k subfiles
|
# run linear model, the program will automatically split the inputs
|
||||||
python splitrows.py ../data/agaricus.txt.train mushroom $k
|
../../tracker/rabit_demo.py -n $k linear.rabit ../data/agaricus.txt.train reg_L1=1
|
||||||
|
|
||||||
# run xgboost mpi
|
|
||||||
../../tracker/rabit_demo.py -n $k linear.rabit mushroom.row\%d "${*:2}" reg_L1=1
|
|
||||||
|
|
||||||
./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model
|
./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model
|
||||||
|
|||||||
subtree/rabit/rabit-learn/linear/run-yarn.sh (new executable file, 19 lines)
@ -0,0 +1,19 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
if [ "$#" -lt 3 ];
|
||||||
|
then
|
||||||
|
echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
|
||||||
|
exit -1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# put the local training file to HDFS
|
||||||
|
hadoop fs -rm -r -f $2/data
|
||||||
|
hadoop fs -rm -r -f $2/mushroom.linear.model
|
||||||
|
hadoop fs -mkdir $2/data
|
||||||
|
|
||||||
|
# submit to hadoop
|
||||||
|
../../tracker/rabit_yarn.py -n $1 --vcores 1 linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}"
|
||||||
|
|
||||||
|
# get the final model file
|
||||||
|
hadoop fs -get $2/mushroom.linear.model ./linear.model
|
||||||
|
|
||||||
|
./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model
|
||||||
@ -1,24 +0,0 @@
|
|||||||
#!/usr/bin/python
|
|
||||||
import sys
|
|
||||||
import random
|
|
||||||
|
|
||||||
# split libsvm file into different rows
|
|
||||||
if len(sys.argv) < 4:
|
|
||||||
print ('Usage:<fin> <fo> k')
|
|
||||||
exit(0)
|
|
||||||
|
|
||||||
random.seed(10)
|
|
||||||
|
|
||||||
k = int(sys.argv[3])
|
|
||||||
fi = open( sys.argv[1], 'r' )
|
|
||||||
fos = []
|
|
||||||
|
|
||||||
for i in range(k):
|
|
||||||
fos.append(open( sys.argv[2]+'.row%d' % i, 'w' ))
|
|
||||||
|
|
||||||
for l in open(sys.argv[1]):
|
|
||||||
i = random.randint(0, k-1)
|
|
||||||
fos[i].write(l)
|
|
||||||
|
|
||||||
for f in fos:
|
|
||||||
f.close()
|
|
||||||
@ -1,13 +1,20 @@
|
|||||||
# this is the common build script for rabit programs
|
# this is the common build script for rabit programs
|
||||||
# you do not have to use it
|
# you do not have to use it
|
||||||
export CC = gcc
|
export LDFLAGS= -L../../lib -pthread -lm -lrt
|
||||||
export CXX = g++
|
export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fPIC -I../../include
|
||||||
export MPICXX = mpicxx
|
|
||||||
export LDFLAGS= -pthread -lm -L../../lib
|
# setup opencv
|
||||||
export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fPIC -I../../include
|
ifeq ($(USE_HDFS),1)
|
||||||
|
CFLAGS+= -DRABIT_USE_HDFS=1 -I$(HADOOP_HDFS_HOME)/include -I$(JAVA_HOME)/include
|
||||||
|
LDFLAGS+= -L$(HADOOP_HDFS_HOME)/lib/native -L$(LIBJVM) -lhdfs -ljvm
|
||||||
|
else
|
||||||
|
CFLAGS+= -DRABIT_USE_HDFS=0
|
||||||
|
endif
|
||||||
|
|
||||||
.PHONY: clean all lib mpi
|
.PHONY: clean all lib mpi
|
||||||
|
|
||||||
all: $(BIN) $(MOCKBIN)
|
all: $(BIN) $(MOCKBIN)
|
||||||
|
|
||||||
mpi: $(MPIBIN)
|
mpi: $(MPIBIN)
|
||||||
|
|
||||||
lib:
|
lib:
|
||||||
@ -15,10 +22,12 @@ lib:
|
|||||||
libmpi:
|
libmpi:
|
||||||
cd ../..;make lib/librabit_mpi.a;cd -
|
cd ../..;make lib/librabit_mpi.a;cd -
|
||||||
|
|
||||||
|
|
||||||
$(BIN) :
|
$(BIN) :
|
||||||
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) $(LDFLAGS) -lrabit
|
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) -lrabit $(LDFLAGS)
|
||||||
|
|
||||||
$(MOCKBIN) :
|
$(MOCKBIN) :
|
||||||
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) $(LDFLAGS) -lrabit_mock
|
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) -lrabit_mock $(LDFLAGS)
|
||||||
|
|
||||||
$(OBJ) :
|
$(OBJ) :
|
||||||
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
|
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
|
||||||
subtree/rabit/rabit-learn/make/config.mk (new file, 21 lines)
@ -0,0 +1,21 @@
|
|||||||
|
#-----------------------------------------------------
|
||||||
|
# rabit-learn: the configuration compile script
|
||||||
|
#
|
||||||
|
# This is the default configuration setup for rabit-learn
|
||||||
|
# If you want to change configuration, do the following steps:
|
||||||
|
#
|
||||||
|
# - copy this file to the root of rabit-learn folder
|
||||||
|
# - modify the configuration you want
|
||||||
|
# - type make or make -j n for parallel build
|
||||||
|
#----------------------------------------------------
|
||||||
|
|
||||||
|
# choice of compiler
|
||||||
|
export CC = gcc
|
||||||
|
export CXX = g++
|
||||||
|
export MPICXX = mpicxx
|
||||||
|
|
||||||
|
# whether use HDFS support during compile
|
||||||
|
USE_HDFS = 1
|
||||||
|
|
||||||
|
# path to libjvm.so
|
||||||
|
LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
|
||||||
@ -14,7 +14,9 @@
|
|||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <sstream>
|
||||||
#include <rabit.h>
|
#include <rabit.h>
|
||||||
|
#include "../io/io.h"
|
||||||
|
|
||||||
namespace rabit {
|
namespace rabit {
|
||||||
// typedef index type
|
// typedef index type
|
||||||
@ -45,49 +47,37 @@ struct SparseMat {
  }
  // load data from LibSVM format
  inline void Load(const char *fname) {
-    FILE *fi;
-    if (!strcmp(fname, "stdin")) {
-      fi = stdin;
-    } else {
-      if (strchr(fname, '%') != NULL) {
-        char s_tmp[256];
-        snprintf(s_tmp, sizeof(s_tmp), fname, rabit::GetRank());
-        fi = utils::FopenCheck(s_tmp, "r");
-      } else {
-        fi = utils::FopenCheck(fname, "r");
-      }
-    }
+    io::InputSplit *in =
+        io::CreateInputSplit
+        (fname, rabit::GetRank(),
+         rabit::GetWorldSize());
    row_ptr.clear();
    row_ptr.push_back(0);
    data.clear();
    feat_dim = 0;
-    float label; bool init = true;
-    char tmp[1024];
-    while (fscanf(fi, "%s", tmp) == 1) {
+    std::string line;
+    while (in->NextLine(&line)) {
+      float label;
+      std::istringstream ss(line);
+      ss >> label;
      Entry e;
      unsigned long fidx;
-      if (sscanf(tmp, "%lu:%f", &fidx, &e.fvalue) == 2) {
+      while (!ss.eof()) {
+        if (!(ss >> fidx)) break;
+        ss.ignore(32, ':');
+        if (!(ss >> e.fvalue)) break;
        e.findex = static_cast<index_t>(fidx);
        data.push_back(e);
        feat_dim = std::max(fidx, feat_dim);
-      } else {
-        if (!init) {
-          labels.push_back(label);
-          row_ptr.push_back(data.size());
-        }
-        utils::Check(sscanf(tmp, "%f", &label) == 1, "invalid LibSVM format");
-        init = false;
      }
+      labels.push_back(label);
+      row_ptr.push_back(data.size());
    }
-    // last row
-    labels.push_back(label);
-    row_ptr.push_back(data.size());
+    delete in;
    feat_dim += 1;
    utils::Check(feat_dim < std::numeric_limits<index_t>::max(),
                 "feature dimension exceed limit of index_t"\
                 "consider change the index_t to unsigned long");
-    // close the filed
-    if (fi != stdin) fclose(fi);
  }
  inline size_t NumRow(void) const {
    return row_ptr.size() - 1;
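The hunk above replaces `fscanf`-based reading with an `InputSplit` that hands the loader whole lines of LibSVM text: a label followed by `index:value` pairs. As a quick illustration of that per-line format (not the project's own code — just a sketch in Python with a hypothetical helper name):

```python
def parse_libsvm_line(line):
    """Parse one LibSVM line: '<label> <idx>:<val> <idx>:<val> ...'."""
    tokens = line.split()
    label = float(tokens[0])
    features = []
    for tok in tokens[1:]:
        idx, val = tok.split(':', 1)
        features.append((int(idx), float(val)))
    return label, features

# one row with two non-zero features
print(parse_libsvm_line("1 3:0.5 10:2.0"))  # (1.0, [(3, 0.5), (10, 2.0)])
```

This mirrors what the new `std::istringstream` loop does for each line delivered by the input split.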
@ -98,6 +88,7 @@ struct SparseMat {
  std::vector<Entry> data;
  std::vector<float> labels;
};

// dense matrix
struct Matrix {
  inline void Init(size_t nrow, size_t ncol, float v = 0.0f) {

@ -1,40 +0,0 @@
#ifndef RABIT_LEARN_UTILS_IO_H_
#define RABIT_LEARN_UTILS_IO_H_
/*!
 * \file io.h
 * \brief additional stream interface
 * \author Tianqi Chen
 */
namespace rabit {
namespace utils {
/*! \brief implementation of file i/o stream */
class FileStream : public ISeekStream {
 public:
  explicit FileStream(FILE *fp) : fp(fp) {}
  explicit FileStream(void) {
    this->fp = NULL;
  }
  virtual size_t Read(void *ptr, size_t size) {
    return std::fread(ptr, size, 1, fp);
  }
  virtual void Write(const void *ptr, size_t size) {
    std::fwrite(ptr, size, 1, fp);
  }
  virtual void Seek(size_t pos) {
    std::fseek(fp, static_cast<long>(pos), SEEK_SET);
  }
  virtual size_t Tell(void) {
    return std::ftell(fp);
  }
  inline void Close(void) {
    if (fp != NULL){
      std::fclose(fp); fp = NULL;
    }
  }

 private:
  FILE *fp;
};
} // namespace utils
} // namespace rabit
#endif // RABIT_LEARN_UTILS_IO_H_

@ -29,11 +29,24 @@ AllreduceBase::AllreduceBase(void) {
  task_id = "NULL";
  err_link = NULL;
  this->SetParam("rabit_reduce_buffer", "256MB");
+  // setup possible enviroment variable of intrest
+  env_vars.push_back("rabit_task_id");
+  env_vars.push_back("rabit_num_trial");
+  env_vars.push_back("rabit_reduce_buffer");
+  env_vars.push_back("rabit_tracker_uri");
+  env_vars.push_back("rabit_tracker_port");
}

// initialization function
void AllreduceBase::Init(void) {
  // setup from enviroment variables
+  // handler to get variables from env
+  for (size_t i = 0; i < env_vars.size(); ++i) {
+    const char *value = getenv(env_vars[i].c_str());
+    if (value != NULL) {
+      this->SetParam(env_vars[i].c_str(), value);
+    }
+  }
  {
    // handling for hadoop
    const char *task_id = getenv("mapred_tip_id");

@ -413,6 +413,8 @@ class AllreduceBase : public IEngine {
  // pointer to links in the ring
  LinkRecord *ring_prev, *ring_next;
  //----- meta information-----
+  // list of enviroment variables that are of possible interest
+  std::vector<std::string> env_vars;
  // unique identifier of the possible job this process is doing
  // used to assign ranks, optional, default to NULL
  std::string task_id;

@ -27,7 +27,9 @@ AllreduceRobust::AllreduceRobust(void) {
  result_buffer_round = 1;
  global_lazycheck = NULL;
  use_local_model = -1;
  recover_counter = 0;
+  env_vars.push_back("rabit_global_replica");
+  env_vars.push_back("rabit_local_replica");
}
void AllreduceRobust::Init(void) {
  AllreduceBase::Init();

12 subtree/rabit/tracker/README.md Normal file
@ -0,0 +1,12 @@
Trackers
=====
This folder contains tracker scripts that can be used to submit rabit jobs to different platforms,
the example guidelines are in the scripts themselves

***Supported Platforms***
* Local demo: [rabit_demo.py](rabit_demo.py)
* MPI: [rabit_mpi.py](rabit_mpi.py)
* Yarn (Hadoop): [rabit_yarn.py](rabit_yarn.py)
  - It is also possible to submit via hadoop streaming with rabit_hadoop_streaming.py
  - However, it is highly recommended to use rabit_yarn.py because this will allocate resources more precisely and fits machine learning scenarios

@ -31,35 +31,38 @@ nrep=0
rc=254
while [ $rc -eq 254 ];
do
+    export rabit_num_trial=$nrep
    %s
-    %s %s rabit_num_trial=$nrep
+    %s
    rc=$?;
    nrep=$((nrep+1));
done
"""

-def exec_cmd(cmd, taskid):
+def exec_cmd(cmd, taskid, worker_env):
    if cmd[0].find('/') == -1 and os.path.exists(cmd[0]) and os.name != 'nt':
        cmd[0] = './' + cmd[0]
    cmd = ' '.join(cmd)
-    arg = ' rabit_task_id=%d' % (taskid)
-    cmd = cmd + arg
+    env = {}
+    for k, v in worker_env.items():
+        env[k] = str(v)
+    env['rabit_task_id'] = str(taskid)
+    env['PYTHONPATH'] = WRAPPER_PATH

    ntrial = 0
    while True:
        if os.name == 'nt':
-            prep = 'SET PYTHONPATH=\"%s\"\n' % WRAPPER_PATH
-            ret = subprocess.call(prep + cmd + ('rabit_num_trial=%d' % ntrial), shell=True)
+            env['rabit_num_trial'] = str(ntrial)
+            ret = subprocess.call(cmd, shell=True, env = env)
            if ret == 254:
                ntrial += 1
                continue

        else:
-            prep = 'PYTHONPATH=\"%s\" ' % WRAPPER_PATH
-            if args.verbose != 0:
-                bash = keepalive % (echo % cmd, prep, cmd)
+            if args.verbose != 0:
+                bash = keepalive % (echo % cmd, cmd)
            else:
-                bash = keepalive % ('', prep, cmd)
+                bash = keepalive % ('', cmd)
-            ret = subprocess.call(bash, shell=True, executable='bash')
+            ret = subprocess.call(bash, shell=True, executable='bash', env = env)
            if ret == 0:
                if args.verbose != 0:
                    print 'Thread %d exit with 0' % taskid
@ -73,7 +76,7 @@ def exec_cmd(cmd, taskid):
# Note: this submit script is only used for demo purpose
# submission script using pyhton multi-threading
#
-def mthread_submit(nslave, worker_args):
+def mthread_submit(nslave, worker_args, worker_envs):
    """
    customized submit script, that submit nslave jobs, each must contain args as parameter
    note this can be a lambda function containing additional parameters in input
@ -84,7 +87,7 @@ def mthread_submit(nslave, worker_args):
    """
    procs = {}
    for i in range(nslave):
-        procs[i] = Thread(target = exec_cmd, args = (args.command + worker_args, i))
+        procs[i] = Thread(target = exec_cmd, args = (args.command + worker_args, i, worker_envs))
        procs[i].daemon = True
        procs[i].start()
    for i in range(nslave):

@ -1,7 +1,11 @@
#!/usr/bin/python
"""
+Deprecated
+
This is a script to submit rabit job using hadoop streaming.
It will submit the rabit process as mappers of MapReduce.
+
+This script is deprecated, it is highly recommended to use rabit_yarn.py instead
"""
import argparse
import sys
@ -34,13 +38,11 @@ if hadoop_binary == None or hadoop_streaming_jar == None:
                  ', or modify rabit_hadoop.py line 16', stacklevel = 2)

parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs using Hadoop Streaming.'\
-                                 'This script support both Hadoop 1.0 and Yarn(MRv2), Yarn is recommended')
+                                 'It is Highly recommended to use rabit_yarn.py instead')
parser.add_argument('-n', '--nworker', required=True, type=int,
                    help = 'number of worker proccess to be launched')
parser.add_argument('-hip', '--host_ip', default='auto', type=str,
                    help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
-parser.add_argument('-nt', '--nthread', default = -1, type=int,
-                    help = 'number of thread in each mapper to be launched, set it if each rabit job is multi-threaded')
parser.add_argument('-i', '--input', required=True,
                    help = 'input path in HDFS')
parser.add_argument('-o', '--output', required=True,
@ -61,6 +63,8 @@ parser.add_argument('--jobname', default='auto', help = 'customize jobname in tr
parser.add_argument('--timeout', default=600000000, type=int,
                    help = 'timeout (in million seconds) of each mapper job, automatically set to a very long time,'\
                    'normally you do not need to set this ')
+parser.add_argument('--vcores', default = -1, type=int,
+                    help = 'number of vcpores to request in each mapper, set it if each rabit job is multi-threaded')
parser.add_argument('-mem', '--memory_mb', default=-1, type=int,
                    help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
                    'if you are running multi-threading rabit,'\
@ -91,10 +95,14 @@ out = out.split('\n')[0].split()
assert out[0] == 'Hadoop', 'cannot parse hadoop version string'
hadoop_version = out[1].split('.')
use_yarn = int(hadoop_version[0]) >= 2
+if use_yarn:
+    warnings.warn('It is highly recommended to use rabit_yarn.py to submit jobs to yarn instead', stacklevel = 2)
+
print 'Current Hadoop Version is %s' % out[1]

-def hadoop_streaming(nworker, worker_args, use_yarn):
+def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
+    worker_envs['CLASSPATH'] = '`$HADOOP_HOME/bin/hadoop classpath --glob` '
+    worker_envs['LD_LIBRARY_PATH'] = '{LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server'
    fset = set()
    if args.auto_file_cache:
        for i in range(len(args.command)):
@ -113,6 +121,7 @@ def hadoop_streaming(nworker, worker_args, use_yarn):
        if os.path.exists(f):
            fset.add(f)
    kmap = {}
+    kmap['env'] = 'mapred.child.env'
    # setup keymaps
    if use_yarn:
        kmap['nworker'] = 'mapreduce.job.maps'
@ -129,12 +138,14 @@ def hadoop_streaming(nworker, worker_args, use_yarn):
    cmd = '%s jar %s' % (args.hadoop_binary, args.hadoop_streaming_jar)
    cmd += ' -D%s=%d' % (kmap['nworker'], nworker)
    cmd += ' -D%s=%s' % (kmap['jobname'], args.jobname)
-    if args.nthread != -1:
+    envstr = ','.join('%s=%s' % (k, str(v)) for k, v in worker_envs.items())
+    cmd += ' -D%s=\"%s\"' % (kmap['env'], envstr)
+    if args.vcores != -1:
        if kmap['nthread'] is None:
            warnings.warn('nthread can only be set in Yarn(Hadoop version greater than 2.0),'\
                          'it is recommended to use Yarn to submit rabit jobs', stacklevel = 2)
        else:
-            cmd += ' -D%s=%d' % (kmap['nthread'], args.nthread)
+            cmd += ' -D%s=%d' % (kmap['nthread'], args.vcores)
    cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
    if args.memory_mb != -1:
        cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
@ -150,5 +161,5 @@ def hadoop_streaming(nworker, worker_args, use_yarn):
    print cmd
    subprocess.check_call(cmd, shell = True)

-fun_submit = lambda nworker, worker_args: hadoop_streaming(nworker, worker_args, int(hadoop_version[0]) >= 2)
+fun_submit = lambda nworker, worker_args, worker_envs: hadoop_streaming(nworker, worker_args, worker_envs, int(hadoop_version[0]) >= 2)
tracker.submit(args.nworker, [], fun_submit = fun_submit, verbose = args.verbose, hostIP = args.host_ip)

@ -22,7 +22,7 @@ args = parser.parse_args()
#
# submission script using MPI
#
-def mpi_submit(nslave, worker_args):
+def mpi_submit(nslave, worker_args, worker_envs):
    """
    customized submit script, that submit nslave jobs, each must contain args as parameter
    note this can be a lambda function containing additional parameters in input
@ -31,6 +31,7 @@ def mpi_submit(nslave, worker_args):
       args arguments to launch each job
          this usually includes the parameters of master_uri and parameters passed into submit
    """
+    worker_args += ['%s=%s' % (k, str(v)) for k, v in worker_envs.items()]
    sargs = ' '.join(args.command + worker_args)
    if args.hostfile is None:
        cmd = ' '.join(['mpirun -n %d' % (nslave)] + args.command + worker_args)

@ -134,19 +134,25 @@ class Tracker:
        sock.listen(16)
        self.sock = sock
        self.verbose = verbose
+        if hostIP == 'auto':
+            hostIP = 'dns'
        self.hostIP = hostIP
        self.log_print('start listen on %s:%d' % (socket.gethostname(), self.port), 1)
    def __del__(self):
        self.sock.close()
-    def slave_args(self):
-        if self.hostIP == 'auto':
+    def slave_envs(self):
+        """
+        get enviroment variables for slaves
+        can be passed in as args or envs
+        """
+        if self.hostIP == 'dns':
            host = socket.gethostname()
        elif self.hostIP == 'ip':
            host = socket.gethostbyname(socket.getfqdn())
        else:
            host = self.hostIP
-        return ['rabit_tracker_uri=%s' % host,
-                'rabit_tracker_port=%s' % self.port]
+        return {'rabit_tracker_uri': host,
+                'rabit_tracker_port': self.port}
    def get_neighbor(self, rank, nslave):
        rank = rank + 1
        ret = []
@ -261,9 +267,9 @@ class Tracker:
            wait_conn[rank] = s
        self.log_print('@tracker All nodes finishes job', 2)

-def submit(nslave, args, fun_submit, verbose, hostIP):
+def submit(nslave, args, fun_submit, verbose, hostIP = 'auto'):
    master = Tracker(verbose = verbose, hostIP = hostIP)
-    submit_thread = Thread(target = fun_submit, args = (nslave, args + master.slave_args()))
+    submit_thread = Thread(target = fun_submit, args = (nslave, args, master.slave_envs()))
    submit_thread.daemon = True
    submit_thread.start()
    master.accept_slaves(nslave)
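With this change the tracker no longer appends `rabit_tracker_uri=...` style arguments to the command line; `Tracker.slave_envs()` returns a dict of environment variables and `submit()` passes `(nslave, args, envs)` to the platform-specific `fun_submit`, which decides how to deliver them (process environment for the demo launcher, `mapred.child.env` for streaming, `key=value` arguments for MPI). A minimal sketch of a custom launcher written against this new contract — the function and program names here are hypothetical, not part of the repository:

```python
import os
import subprocess
import rabit_tracker as tracker  # the module shown in the diff above

def my_submit(nslave, worker_args, worker_envs):
    # deliver tracker variables via the process environment,
    # the same way rabit_demo.py does after this change
    procs = []
    for rank in range(nslave):
        env = os.environ.copy()
        env.update({k: str(v) for k, v in worker_envs.items()})
        env['rabit_task_id'] = str(rank)
        procs.append(subprocess.Popen(['./my_program'] + worker_args, env=env))
    for p in procs:
        p.wait()

# tracker.submit starts the tracker thread and then calls
# my_submit(nslave, args, envs):
# tracker.submit(4, [], fun_submit=my_submit, verbose=0)
```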
122 subtree/rabit/tracker/rabit_yarn.py Executable file
@ -0,0 +1,122 @@
#!/usr/bin/python
"""
This is a script to submit rabit job via Yarn
rabit will run as a Yarn application
"""
import argparse
import sys
import os
import time
import subprocess
import warnings
import rabit_tracker as tracker

WRAPPER_PATH = os.path.dirname(__file__) + '/../wrapper'
YARN_JAR_PATH = os.path.dirname(__file__) + '/../yarn/rabit-yarn.jar'

assert os.path.exists(YARN_JAR_PATH), ("cannot find \"%s\", please run build.sh on the yarn folder" % YARN_JAR_PATH)
hadoop_binary = 'hadoop'
# code
hadoop_home = os.getenv('HADOOP_HOME')

if hadoop_home != None:
    if hadoop_binary == None:
        hadoop_binary = hadoop_home + '/bin/hadoop'
        assert os.path.exists(hadoop_binary), "HADOOP_HOME does not contain the hadoop binary"

parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs to Yarn.')
parser.add_argument('-n', '--nworker', required=True, type=int,
                    help = 'number of worker proccess to be launched')
parser.add_argument('-hip', '--host_ip', default='auto', type=str,
                    help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
                    help = 'print more messages into the console')
parser.add_argument('-ac', '--auto_file_cache', default=1, choices=[0, 1], type=int,
                    help = 'whether automatically cache the files in the command to hadoop localfile, this is on by default')
parser.add_argument('-f', '--files', default = [], action='append',
                    help = 'the cached file list in mapreduce,'\
                        ' the submission script will automatically cache all the files which appears in command'\
                        ' This will also cause rewritten of all the file names in the command to current path,'\
                        ' for example `../../kmeans ../kmeans.conf` will be rewritten to `./kmeans kmeans.conf`'\
                        ' because the two files are cached to running folder.'\
                        ' You may need this option to cache additional files.'\
                        ' You can also use it to manually cache files when auto_file_cache is off')
parser.add_argument('--jobname', default='auto', help = 'customize jobname in tracker')
parser.add_argument('--tempdir', default='/tmp', help = 'temporary directory in HDFS that can be used to store intermediate results')
parser.add_argument('--vcores', default = 1, type=int,
                    help = 'number of vcpores to request in each mapper, set it if each rabit job is multi-threaded')
parser.add_argument('-mem', '--memory_mb', default=1024, type=int,
                    help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
                        'if you are running multi-threading rabit,'\
                        'so that each node can occupy all the mapper slots in a machine for maximum performance')
parser.add_argument('command', nargs='+',
                    help = 'command for rabit program')
args = parser.parse_args()

if args.jobname == 'auto':
    args.jobname = ('Rabit[nworker=%d]:' % args.nworker) + args.command[0].split('/')[-1];

if hadoop_binary == None:
    parser.add_argument('-hb', '--hadoop_binary', required = True,
                        help="path to hadoop binary file")
else:
    parser.add_argument('-hb', '--hadoop_binary', default = hadoop_binary,
                        help="path to hadoop binary file")

args = parser.parse_args()

if args.jobname == 'auto':
    args.jobname = ('Rabit[nworker=%d]:' % args.nworker) + args.command[0].split('/')[-1];

# detech hadoop version
(out, err) = subprocess.Popen('%s version' % args.hadoop_binary, shell = True, stdout=subprocess.PIPE).communicate()
out = out.split('\n')[0].split()
assert out[0] == 'Hadoop', 'cannot parse hadoop version string'
hadoop_version = out[1].split('.')

(classpath, err) = subprocess.Popen('%s classpath --glob' % args.hadoop_binary, shell = True, stdout=subprocess.PIPE).communicate()

if hadoop_version < 2:
    print 'Current Hadoop Version is %s, rabit_yarn will need Yarn(Hadoop 2.0)' % out[1]

def submit_yarn(nworker, worker_args, worker_env):
    fset = set([YARN_JAR_PATH])
    if args.auto_file_cache != 0:
        for i in range(len(args.command)):
            f = args.command[i]
            if os.path.exists(f):
                fset.add(f)
                if i == 0:
                    args.command[i] = './' + args.command[i].split('/')[-1]
                else:
                    args.command[i] = args.command[i].split('/')[-1]
    if args.command[0].endswith('.py'):
        flst = [WRAPPER_PATH + '/rabit.py',
                WRAPPER_PATH + '/librabit_wrapper.so',
                WRAPPER_PATH + '/librabit_wrapper_mock.so']
        for f in flst:
            if os.path.exists(f):
                fset.add(f)

    cmd = 'java -cp `%s classpath`:%s org.apache.hadoop.yarn.rabit.Client ' % (args.hadoop_binary, YARN_JAR_PATH)
    env = os.environ.copy()
    for k, v in worker_env.items():
        env[k] = str(v)
    env['rabit_cpu_vcores'] = str(args.vcores)
    env['rabit_memory_mb'] = str(args.memory_mb)
    env['rabit_world_size'] = str(args.nworker)

    if args.files != None:
        for flst in args.files:
            for f in flst.split('#'):
                fset.add(f)
    for f in fset:
        cmd += ' -file %s' % f
    cmd += ' -jobname %s ' % args.jobname
    cmd += ' -tempdir %s ' % args.tempdir
    cmd += (' '.join(args.command + worker_args))
    print cmd
    subprocess.check_call(cmd, shell = True, env = env)

tracker.submit(args.nworker, [], fun_submit = submit_yarn, verbose = args.verbose, hostIP = args.host_ip)
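Note how rabit_yarn.py passes all job parameters to the Java client through `rabit_*` environment variables (`rabit_world_size`, `rabit_cpu_vcores`, `rabit_memory_mb`, plus the tracker address from `slave_envs()`); the ApplicationMaster further below reads them back and re-exports `rabit_task_id` and `rabit_num_trial` per container. As a rough sketch of what a worker-side wrapper could expect to find in its environment under this convention (illustrative only — the real wrapper is `rabit.py` under `../wrapper`):

```python
import os

def read_rabit_env():
    """Collect the rabit_* variables a launched worker can expect to see."""
    keys = ['rabit_tracker_uri', 'rabit_tracker_port',
            'rabit_task_id', 'rabit_num_trial']
    return {k: os.environ[k] for k in keys if k in os.environ}

print(read_rabit_env())
```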
4 subtree/rabit/yarn/.gitignore vendored Normal file
@ -0,0 +1,4 @@
bin
.classpath
.project
*.jar

5 subtree/rabit/yarn/README.md Normal file
@ -0,0 +1,5 @@
rabit-yarn
=====
* This folder contains Application code to allow rabit to run on Yarn.
* You can use [../tracker/rabit_yarn.py](../tracker/rabit_yarn.py) to submit the job
  - run ```./build.sh``` to build the jar, before using the script

1 subtree/rabit/yarn/bin/README Normal file
@ -0,0 +1 @@
folder used to hold generated class files

4 subtree/rabit/yarn/build.sh Executable file
@ -0,0 +1,4 @@
#!/bin/bash
CPATH=`${HADOOP_PREFIX}/bin/hadoop classpath`
javac -cp $CPATH -d bin src/org/apache/hadoop/yarn/rabit/*
jar cf rabit-yarn.jar -C bin .
@ -0,0 +1,508 @@
package org.apache.hadoop.yarn.rabit;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Collection;
import java.util.Collections;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.Records;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;

/**
 * application master for allocating resources of rabit client
 *
 * @author Tianqi Chen
 */
public class ApplicationMaster {
    // logger
    private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
    // configuration
    private Configuration conf = new YarnConfiguration();
    // hdfs handler
    private FileSystem dfs;

    // number of cores allocated for each task
    private int numVCores = 1;
    // memory needed requested for the task
    private int numMemoryMB = 10;
    // priority of the app master
    private int appPriority = 0;
    // total number of tasks
    private int numTasks = 1;
    // maximum number of attempts to try in each task
    private int maxNumAttempt = 3;
    // command to launch
    private String command = "";

    // application tracker hostname
    private String appHostName = "";
    // tracker URL to do
    private String appTrackerUrl = "";
    // tracker port
    private int appTrackerPort = 0;

    // whether we start to abort the application, due to whatever fatal reasons
    private boolean startAbort = false;
    // worker resources
    private Map<String, LocalResource> workerResources = new java.util.HashMap<String, LocalResource>();
    // record the aborting reason
    private String abortDiagnosis = "";
    // resource manager
    private AMRMClientAsync<ContainerRequest> rmClient = null;
    // node manager
    private NMClientAsync nmClient = null;

    // list of tasks that pending for resources to be allocated
    private final Queue<TaskRecord> pendingTasks = new java.util.LinkedList<TaskRecord>();
    // map containerId->task record of tasks that was running
    private final Map<ContainerId, TaskRecord> runningTasks = new java.util.HashMap<ContainerId, TaskRecord>();
    // collection of tasks
    private final Collection<TaskRecord> finishedTasks = new java.util.LinkedList<TaskRecord>();
    // collection of killed tasks
    private final Collection<TaskRecord> killedTasks = new java.util.LinkedList<TaskRecord>();

    public static void main(String[] args) throws Exception {
        new ApplicationMaster().run(args);
    }

    private ApplicationMaster() throws IOException {
        dfs = FileSystem.get(conf);
    }

    /**
     * get integer argument from environment variable
     *
     * @param name
     *            name of key
     * @param required
     *            whether this is required
     * @param defv
     *            default value
     * @return the requested result
     */
    private int getEnvInteger(String name, boolean required, int defv)
            throws IOException {
        String value = System.getenv(name);
        if (value == null) {
            if (required) {
                throw new IOException("environment variable " + name
                        + " not set");
            } else {
                return defv;
            }
        }
        return Integer.valueOf(value);
    }

    /**
     * initialize from arguments and command lines
     *
     * @param args
     */
    private void initArgs(String args[]) throws IOException {
        LOG.info("Invoke initArgs");
        // cached maps
        Map<String, Path> cacheFiles = new java.util.HashMap<String, Path>();
        for (int i = 0; i < args.length; ++i) {
            if (args[i].equals("-file")) {
                String[] arr = args[++i].split("#");
                Path path = new Path(arr[0]);
                if (arr.length == 1) {
                    cacheFiles.put(path.getName(), path);
                } else {
                    cacheFiles.put(arr[1], path);
                }
            } else {
                this.command += args[i] + " ";
            }
        }
        for (Map.Entry<String, Path> e : cacheFiles.entrySet()) {
            LocalResource r = Records.newRecord(LocalResource.class);
            FileStatus status = dfs.getFileStatus(e.getValue());
            r.setResource(ConverterUtils.getYarnUrlFromPath(e.getValue()));
            r.setSize(status.getLen());
            r.setTimestamp(status.getModificationTime());
            r.setType(LocalResourceType.FILE);
            r.setVisibility(LocalResourceVisibility.APPLICATION);
            workerResources.put(e.getKey(), r);
        }
        numVCores = this.getEnvInteger("rabit_cpu_vcores", true, numVCores);
        numMemoryMB = this.getEnvInteger("rabit_memory_mb", true, numMemoryMB);
        numTasks = this.getEnvInteger("rabit_world_size", true, numTasks);
        maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false, maxNumAttempt);
    }

    /**
     * called to start the application
     */
    private void run(String args[]) throws Exception {
        this.initArgs(args);
        this.rmClient = AMRMClientAsync.createAMRMClientAsync(1000,
                new RMCallbackHandler());
        this.nmClient = NMClientAsync
                .createNMClientAsync(new NMCallbackHandler());
        this.rmClient.init(conf);
        this.rmClient.start();
        this.nmClient.init(conf);
        this.nmClient.start();
        RegisterApplicationMasterResponse response = this.rmClient
                .registerApplicationMaster(this.appHostName,
                        this.appTrackerPort, this.appTrackerUrl);

        boolean success = false;
        String diagnostics = "";
        try {
            // list of tasks that waits to be submit
            java.util.Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
            // add waiting tasks
            for (int i = 0; i < this.numTasks; ++i) {
                tasks.add(new TaskRecord(i));
            }
            Resource maxResource = response.getMaximumResourceCapability();

            if (maxResource.getMemory() < this.numMemoryMB) {
                LOG.warn("[Rabit] memory requested exceed bound "
                        + maxResource.getMemory());
                this.numMemoryMB = maxResource.getMemory();
            }
            if (maxResource.getVirtualCores() < this.numVCores) {
                LOG.warn("[Rabit] memory requested exceed bound "
                        + maxResource.getVirtualCores());
                this.numVCores = maxResource.getVirtualCores();
            }
            this.submitTasks(tasks);
            LOG.info("[Rabit] ApplicationMaster started");
            while (!this.doneAllJobs()) {
                try {
                    Thread.sleep(100);
                } catch (InterruptedException e) {
                }
            }
            assert (killedTasks.size() + finishedTasks.size() == numTasks);
            success = finishedTasks.size() == numTasks;
            LOG.info("Application completed. Stopping running containers");
            nmClient.stop();
            diagnostics = "Diagnostics." + ", num_tasks" + this.numTasks
                    + ", finished=" + this.finishedTasks.size() + ", failed="
                    + this.killedTasks.size() + "\n" + this.abortDiagnosis;
            LOG.info(diagnostics);
        } catch (Exception e) {
            diagnostics = e.toString();
        }
        rmClient.unregisterApplicationMaster(
                success ? FinalApplicationStatus.SUCCEEDED
                        : FinalApplicationStatus.FAILED, diagnostics,
                appTrackerUrl);
        if (!success) throw new Exception("Application not successful");
    }

    /**
     * check if the job finishes
     *
     * @return whether we finished all the jobs
     */
    private synchronized boolean doneAllJobs() {
        return pendingTasks.size() == 0 && runningTasks.size() == 0;
    }

    /**
     * submit tasks to request containers for the tasks
     *
     * @param tasks
     *            a collection of tasks we want to ask container for
     */
    private synchronized void submitTasks(Collection<TaskRecord> tasks) {
        for (TaskRecord r : tasks) {
            Resource resource = Records.newRecord(Resource.class);
            resource.setMemory(numMemoryMB);
            resource.setVirtualCores(numVCores);
            Priority priority = Records.newRecord(Priority.class);
            priority.setPriority(this.appPriority);
            r.containerRequest = new ContainerRequest(resource, null, null,
                    priority);
            rmClient.addContainerRequest(r.containerRequest);
            pendingTasks.add(r);
        }
    }

    /**
     * launch the task on container
     *
     * @param container
     *            container to run the task
     * @param task
     *            the task
     */
    private void launchTask(Container container, TaskRecord task) {
        task.container = container;
        task.containerRequest = null;
        ContainerLaunchContext ctx = Records
                .newRecord(ContainerLaunchContext.class);
        String cmd =
                // use this to setup CLASSPATH correctly for libhdfs
                "CLASSPATH=${CLASSPATH}:`${HADOOP_PREFIX}/bin/hadoop classpath --glob` "
                + this.command + " 1>"
                + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
                + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
                + "/stderr";
        LOG.info(cmd);
        ctx.setCommands(Collections.singletonList(cmd));
        LOG.info(workerResources);
        ctx.setLocalResources(this.workerResources);
        // setup environment variables
        Map<String, String> env = new java.util.HashMap<String, String>();

        // setup class path, this is kind of duplicated, ignoring
        StringBuilder cpath = new StringBuilder("${CLASSPATH}:./*");
        for (String c : conf.getStrings(
                YarnConfiguration.YARN_APPLICATION_CLASSPATH,
                YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
            cpath.append(':');
            cpath.append(c.trim());
        }
        // already use hadoop command to get class path in worker, maybe a better solution in future
        // env.put("CLASSPATH", cpath.toString());
        // setup LD_LIBARY_PATH path for libhdfs
        env.put("LD_LIBRARY_PATH",
                "${LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server");
        env.put("PYTHONPATH", "${PYTHONPATH}:.");
        // inherit all rabit variables
        for (Map.Entry<String, String> e : System.getenv().entrySet()) {
            if (e.getKey().startsWith("rabit_")) {
                env.put(e.getKey(), e.getValue());
            }
        }
        env.put("rabit_task_id", String.valueOf(task.taskId));
        env.put("rabit_num_trial", String.valueOf(task.attemptCounter));

        ctx.setEnvironment(env);
        synchronized (this) {
            assert (!this.runningTasks.containsKey(container.getId()));
            this.runningTasks.put(container.getId(), task);
            this.nmClient.startContainerAsync(container, ctx);
        }
    }

    /**
     * free the containers that have not yet been launched
     *
     * @param containers
     */
    private synchronized void freeUnusedContainers(
            Collection<Container> containers) {
    }

    /**
     * handle method for AMRMClientAsync.CallbackHandler container allocation
     *
     * @param containers
     */
    private synchronized void onContainersAllocated(List<Container> containers) {
        if (this.startAbort) {
            this.freeUnusedContainers(containers);
            return;
        }
        Collection<Container> freelist = new java.util.LinkedList<Container>();
        for (Container c : containers) {
            TaskRecord task;
            task = pendingTasks.poll();
            if (task == null) {
                freelist.add(c);
                continue;
            }
            this.launchTask(c, task);
        }
        this.freeUnusedContainers(freelist);
    }

    /**
     * start aborting the job
     *
     * @param msg
     *            the fatal message
     */
    private synchronized void abortJob(String msg) {
        if (!this.startAbort)
            this.abortDiagnosis = msg;
        this.startAbort = true;
        for (TaskRecord r : this.runningTasks.values()) {
            if (!r.abortRequested) {
                nmClient.stopContainerAsync(r.container.getId(),
                        r.container.getNodeId());
                r.abortRequested = true;
            }
        }
        this.killedTasks.addAll(this.pendingTasks);
        for (TaskRecord r : this.pendingTasks) {
            rmClient.removeContainerRequest(r.containerRequest);
        }
        this.pendingTasks.clear();
        LOG.info(msg);
    }

    /**
     * handle non fatal failures
     *
     * @param cid
     */
    private synchronized void handleFailure(Collection<ContainerId> failed) {
        Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
        for (ContainerId cid : failed) {
            TaskRecord r = runningTasks.remove(cid);
            if (r == null)
                continue;
            r.attemptCounter += 1;
            r.container = null;
            tasks.add(r);
            if (r.attemptCounter >= this.maxNumAttempt) {
                this.abortJob("[Rabit] Task " + r.taskId + " failed more than "
                        + r.attemptCounter + "times");
            }
        }
        if (this.startAbort) {
            this.killedTasks.addAll(tasks);
        } else {
            this.submitTasks(tasks);
        }
    }

    /**
     * handle method for AMRMClientAsync.CallbackHandler container allocation
     *
     * @param status
     *            list of status
     */
    private synchronized void onContainersCompleted(List<ContainerStatus> status) {
        Collection<ContainerId> failed = new java.util.LinkedList<ContainerId>();
        for (ContainerStatus s : status) {
            assert (s.getState().equals(ContainerState.COMPLETE));
            int exstatus = s.getExitStatus();
            TaskRecord r = runningTasks.get(s.getContainerId());
            if (r == null)
                continue;
            if (exstatus == ContainerExitStatus.SUCCESS) {
                finishedTasks.add(r);
                runningTasks.remove(s.getContainerId());
            } else {
                switch (exstatus) {
                case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
                    this.abortJob("[Rabit] Task "
                            + r.taskId
                            + " killed because of exceeding allocated physical memory");
                    break;
                case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
                    this.abortJob("[Rabit] Task "
                            + r.taskId
                            + " killed because of exceeding allocated virtual memory");
                    break;
                default:
                    LOG.info("[Rabit] Task " + r.taskId
                            + " exited with status " + exstatus);
                    failed.add(s.getContainerId());
                }
            }
        }
        this.handleFailure(failed);
    }

    /**
     * callback handler for resource manager
     */
    private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
        @Override
        public float getProgress() {
            return 1.0f - (float) (pendingTasks.size()) / numTasks;
        }

        @Override
        public void onContainersAllocated(List<Container> containers) {
            ApplicationMaster.this.onContainersAllocated(containers);
        }

        @Override
        public void onContainersCompleted(List<ContainerStatus> status) {
            ApplicationMaster.this.onContainersCompleted(status);
        }

        @Override
        public void onError(Throwable ex) {
            ApplicationMaster.this.abortJob("[Rabit] Resource manager Error "
                    + ex.toString());
        }

        @Override
        public void onNodesUpdated(List<NodeReport> nodereport) {
        }

        @Override
        public void onShutdownRequest() {
            ApplicationMaster.this
                    .abortJob("[Rabit] Get shutdown request, start to shutdown...");
        }
    }

    private class NMCallbackHandler implements NMClientAsync.CallbackHandler {
        @Override
        public void onContainerStarted(ContainerId cid,
                Map<String, ByteBuffer> services) {
            LOG.debug("onContainerStarted Invoked");
        }

        @Override
        public void onContainerStatusReceived(ContainerId cid,
                ContainerStatus status) {
            LOG.debug("onContainerStatusReceived Invoked");
        }

        @Override
        public void onContainerStopped(ContainerId cid) {
            LOG.debug("onContainerStopped Invoked");
        }

        @Override
        public void onGetContainerStatusError(ContainerId cid, Throwable ex) {
            LOG.debug("onGetContainerStatusError Invoked: " + ex.toString());
            ApplicationMaster.this
                    .handleFailure(Collections.singletonList(cid));
        }

        @Override
        public void onStartContainerError(ContainerId cid, Throwable ex) {
            LOG.debug("onStartContainerError Invoked: " + ex.toString());
            ApplicationMaster.this
                    .handleFailure(Collections.singletonList(cid));
        }

        @Override
        public void onStopContainerError(ContainerId cid, Throwable ex) {
            LOG.info("onStopContainerError Invoked: " + ex.toString());
        }
    }
}

233 subtree/rabit/yarn/src/org/apache/hadoop/yarn/rabit/Client.java Normal file
@ -0,0 +1,233 @@
package org.apache.hadoop.yarn.rabit;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.client.api.YarnClientApplication;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.Records;

public class Client {
    // logger
    private static final Log LOG = LogFactory.getLog(Client.class);
    // permission for temp file
    private static final FsPermission permTemp = new FsPermission("777");
    // configuration
    private YarnConfiguration conf = new YarnConfiguration();
    // hdfs handler
    private FileSystem dfs;
    // cached maps
    private Map<String, String> cacheFiles = new java.util.HashMap<String, String>();
    // enviroment variable to setup cachefiles
    private String cacheFileArg = "";
    // args to pass to application master
    private String appArgs = "";
    // HDFS Path to store temporal result
    private String tempdir = "/tmp";
    // job name
    private String jobName = "";
    /**
     * constructor
     * @throws IOException
     */
    private Client() throws IOException {
        dfs = FileSystem.get(conf);
    }

    /**
     * ge
     *
     * @param fmaps
     *            the file maps
     * @return the resource map
     * @throws IOException
     */
    private Map<String, LocalResource> setupCacheFiles(ApplicationId appId) throws IOException {
        // create temporary rabit directory
        Path tmpPath = new Path(this.tempdir);
        if (!dfs.exists(tmpPath)) {
            dfs.mkdirs(tmpPath, permTemp);
            LOG.info("HDFS temp directory do not exist, creating.. " + tmpPath);
        }
        tmpPath = new Path(tmpPath + "/temp-rabit-yarn-" + appId);
        if (dfs.exists(tmpPath)) {
            dfs.delete(tmpPath, true);
        }
        // create temporary directory
        FileSystem.mkdirs(dfs, tmpPath, permTemp);

        StringBuilder cstr = new StringBuilder();
        Map<String, LocalResource> rmap = new java.util.HashMap<String, LocalResource>();
        for (Map.Entry<String, String> e : cacheFiles.entrySet()) {
            LocalResource r = Records.newRecord(LocalResource.class);
            Path path = new Path(e.getValue());
            // copy local data to temporary folder in HDFS
            if (!e.getValue().startsWith("hdfs://")) {
                Path dst = new Path("hdfs://" + tmpPath + "/" + path.getName());
                dfs.copyFromLocalFile(false, true, path, dst);
                dfs.setPermission(dst, permTemp);
                dfs.deleteOnExit(dst);
                path = dst;
            }
            FileStatus status = dfs.getFileStatus(path);
            r.setResource(ConverterUtils.getYarnUrlFromPath(path));
            r.setSize(status.getLen());
            r.setTimestamp(status.getModificationTime());
            r.setType(LocalResourceType.FILE);
            r.setVisibility(LocalResourceVisibility.APPLICATION);
            rmap.put(e.getKey(), r);
            cstr.append(" -file \"");
            cstr.append(path.toString());
            cstr.append('#');
            cstr.append(e.getKey());
            cstr.append("\"");
        }

        dfs.deleteOnExit(tmpPath);
        this.cacheFileArg = cstr.toString();
        return rmap;
    }

    /**
     * get the environment variables for container
     *
     * @return the env variable for child class
     */
    private Map<String, String> getEnvironment() {
        // Setup environment variables
        Map<String, String> env = new java.util.HashMap<String, String>();
        String cpath = "${CLASSPATH}:./*";
        for (String c : conf.getStrings(
                YarnConfiguration.YARN_APPLICATION_CLASSPATH,
                YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
            cpath += ':';
            cpath += c.trim();
        }
        env.put("CLASSPATH", cpath);
        for (Map.Entry<String, String> e : System.getenv().entrySet()) {
            if (e.getKey().startsWith("rabit_")) {
                env.put(e.getKey(), e.getValue());
            }
        }
        LOG.debug(env);
        return env;
    }

    /**
     * initialize the settings
     *
     * @param args
     */
    private void initArgs(String[] args) {
        // directly pass all arguments except args0
        StringBuilder sargs = new StringBuilder("");
        for (int i = 0; i < args.length; ++i) {
            if (args[i].equals("-file")) {
                String[] arr = args[++i].split("#");
                if (arr.length == 1) {
                    cacheFiles.put(new Path(arr[0]).getName(), arr[0]);
                } else {
                    cacheFiles.put(arr[1], arr[0]);
                }
            } else if(args[i].equals("-jobname")) {
                this.jobName = args[++i];
            } else if(args[i].equals("-tempdir")) {
                this.tempdir = args[++i];
            } else {
                sargs.append(" ");
                sargs.append(args[i]);
            }
        }
        this.appArgs = sargs.toString();
    }

    private void run(String[] args) throws Exception {
        if (args.length == 0) {
            System.out.println("Usage: [options] [commands..]");
            System.out.println("options: [-file filename]");
            return;
        }
        this.initArgs(args);
        // Create yarnClient
        YarnConfiguration conf = new YarnConfiguration();
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(conf);
        yarnClient.start();

        // Create application via yarnClient
        YarnClientApplication app = yarnClient.createApplication();

        // Set up the container launch context for the application master
        ContainerLaunchContext amContainer = Records
                .newRecord(ContainerLaunchContext.class);
        ApplicationSubmissionContext appContext = app
                .getApplicationSubmissionContext();
        // Submit application
        ApplicationId appId = appContext.getApplicationId();
        // setup cache-files and environment variables
        amContainer.setLocalResources(this.setupCacheFiles(appId));
        amContainer.setEnvironment(this.getEnvironment());
        String cmd = "$JAVA_HOME/bin/java"
                + " -Xmx256M"
                + " org.apache.hadoop.yarn.rabit.ApplicationMaster"
                + this.cacheFileArg + ' ' + this.appArgs + " 1>"
                + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
                + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr";
        LOG.debug(cmd);
        amContainer.setCommands(Collections.singletonList(cmd));

        // Set up resource type requirements for ApplicationMaster
        Resource capability = Records.newRecord(Resource.class);
        capability.setMemory(256);
        capability.setVirtualCores(1);
        LOG.info("jobname=" + this.jobName);

        appContext.setApplicationName(jobName + ":RABIT-YARN");
        appContext.setAMContainerSpec(amContainer);
        appContext.setResource(capability);
        appContext.setQueue("default");

        LOG.info("Submitting application " + appId);
        yarnClient.submitApplication(appContext);

        ApplicationReport appReport = yarnClient.getApplicationReport(appId);
        YarnApplicationState appState = appReport.getYarnApplicationState();
        while (appState != YarnApplicationState.FINISHED
                && appState != YarnApplicationState.KILLED
                && appState != YarnApplicationState.FAILED) {
            Thread.sleep(100);
            appReport = yarnClient.getApplicationReport(appId);
            appState = appReport.getYarnApplicationState();
        }

        System.out.println("Application " + appId + " finished with"
                + " state " + appState + " at " + appReport.getFinishTime());
        if (!appReport.getFinalApplicationStatus().equals(
                FinalApplicationStatus.SUCCEEDED)) {
            System.err.println(appReport.getDiagnostics());
        }
    }

    public static void main(String[] args) throws Exception {
        new Client().run(args);
    }
}

@ -0,0 +1,24 @@
package org.apache.hadoop.yarn.rabit;

import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;

/**
 * data structure to hold the task information
 */
public class TaskRecord {
    // task id of the task
    public int taskId = 0;
    // number of failed attempts to run the task
    public int attemptCounter = 0;
    // container request, can be null if task is already running
    public ContainerRequest containerRequest = null;
    // running container, can be null if the task is not launched
    public Container container = null;
    // whether we have requested abortion of this task
    public boolean abortRequested = false;

    public TaskRecord(int taskId) {
        this.taskId = taskId;
    }
}
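Across the launchers in this commit, recovery is driven by the worker's exit code: the demo keepalive script and `exec_cmd` rerun a worker that exits with 254 while bumping `rabit_num_trial`, and the YARN ApplicationMaster above resubmits a failed container and exports the attempt counter the same way, up to `rabit_max_attempt`. A minimal sketch of that retry convention, assuming a generic launcher (function name and defaults are illustrative, not from the repository):

```python
import subprocess

def run_with_retry(cmd, base_env, max_attempt=3):
    """Re-launch a worker that asks for recovery (exit code 254),
    exporting rabit_num_trial on each attempt like the trackers do."""
    ret = 0
    for trial in range(max_attempt):
        env = dict(base_env, rabit_num_trial=str(trial))
        ret = subprocess.call(cmd, env=env)
        if ret != 254:
            return ret
    return ret
```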