[PLUGIN] Add plugin system

This commit is contained in:
tqchen 2016-01-12 11:56:22 -08:00
parent 36c389ac46
commit 96f4542a67
14 changed files with 457 additions and 13 deletions

1
.gitignore vendored
View File

@ -72,3 +72,4 @@ build
config.mk
xgboost
*.data
build_plugin

View File

@ -1,3 +1,8 @@
# flags by plugin
PLUGIN_OBJS=
PLUGIN_LDFLAGS=
PLUGIN_CFLAGS=
ifndef config
ifneq ("$(wildcard ./config.mk)","")
config = config.mk
@ -36,8 +41,8 @@ ifeq ($(OS), Windows_NT)
export CC = gcc -m64
endif
export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS)
export CFLAGS= -std=c++0x -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops -fPIC -Iinclude $(ADD_CFLAGS)
export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS) $(PLUGIN_LDFLAGS)
export CFLAGS= -std=c++0x -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops -fPIC -Iinclude $(ADD_CFLAGS) $(PLUGIN_CFLAGS)
CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include
#java include path
export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java
@ -76,7 +81,7 @@ $(RABIT)/lib/$(LIB_RABIT):
java: java/libxgboost4j.so
SRC = $(wildcard src/*.cc src/*/*.cc)
ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC))
ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC)) $(PLUGIN_OBJS)
AMALGA_OBJ = amalgamation/xgboost-all0.o
LIB_DEP = $(DMLC_CORE)/libdmlc.a $(RABIT)/lib/$(LIB_RABIT)
ALL_DEP = $(filter-out build/cli_main.o, $(ALL_OBJ)) $(LIB_DEP)
@ -87,6 +92,11 @@ build/%.o: src/%.cc
$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
$(CXX) -c $(CFLAGS) -c $< -o $@
build_plugin/%.o: plugin/%.cc
@mkdir -p $(@D)
$(CXX) $(CFLAGS) -MM -MT build_plugin/$*.o $< >build_plugin/$*.d
$(CXX) -c $(CFLAGS) -c $< -o $@
# This should be equivalent to $(ALL_OBJ) except for build/cli_main.o
amalgamation/xgboost-all0.o: amalgamation/xgboost-all0.cc
$(CXX) -c $(CFLAGS) -c $< -o $@
@ -104,20 +114,20 @@ lib/libxgboost.so: $(ALL_DEP)
@mkdir -p $(@D)
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
java/libxgboost4j.so: java/xgboost4j_wrapper.cpp lib/libxgboost.a $(LIB_DEP)
java/libxgboost4j.so: java/xgboost4j_wrapper.cpp $(ALL_DEP)
$(CXX) $(CFLAGS) $(JAVAINCFLAGS) -shared -o $@ $(filter %.cpp %.o %.a, $^) $(LDFLAGS)
xgboost: $(CLI_OBJ) lib/libxgboost.a $(LIB_DEP)
xgboost: $(CLI_OBJ) $(ALL_DEP)
$(CXX) $(CFLAGS) -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
rcpplint:
python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} R-package/src
lint: rcpplint
python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} include src
python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} include src plugin
clean:
$(RM) -rf build lib bin *~ */*~ */*/*~ */*/*/*~ amalgamation/*.o xgboost
$(RM) -rf build build_plugin lib bin *~ */*~ */*/*~ */*/*/*~ amalgamation/*.o xgboost
clean_all: clean
cd $(DMLC_CORE); make clean; cd -
@ -157,3 +167,4 @@ Rcheck:
-include build/*.d
-include build/*/*.d
-include build_plugin/*/*.d

View File

@ -46,3 +46,9 @@ LIB_RABIT = librabit.a
# path to libjvm.so
LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
# List of additional plugins, checkout plugin folder.
# uncomment the following lines to include these plugins
# you can also add your own plugin like this
# include plugin/example/plugin.mk

32
plugin/README.md Normal file
View File

@ -0,0 +1,32 @@
XGBoost Plugins Modules
=======================
This folder contains plugin modules to xgboost that can be optionally installed.
The plugin system helps us to extend xgboost with additional features,
and add experimental features that may not yet be ready to be included in the main project.
To include a certain plugin, say ```plugin_a```, you only need to add the following line to the config.mk.
```makefile
# Add plugin by including the plugin in config
include plugin/plugin_a/plugin.mk
```
Then rebuild libxgboost by typing make; you will get a new library with the plugin enabled.
Link Static XGBoost Library with Plugins
----------------------------------------
This problem only happens when you link ```libxgboost.a```.
If you only use ```libxgboost.so``` (this includes Python and other bindings),
you can ignore this section.
When you want to link ```libxgboost.a``` with additional plugins included,
you will need to enable whole-archive linking via the following option.
```bash
--whole-archive libxgboost.a --no-whole-archive
```
Write Your Own Plugin
---------------------
You can plug your own modules into xgboost by adding code to this folder,
without modification to the main code repo.
The [example](example) folder provides an example to write a plugin.

21
plugin/example/README.md Normal file
View File

@ -0,0 +1,21 @@
XGBoost Plugin Example
======================
This folder provides an example of xgboost plugin.
There are three steps you need to do to add a plugin to xgboost
- Create your source .cc file, implement a new extension
- In this example [custom_obj.cc](custom_obj.cc)
- Register this extension to xgboost via the registration macro
- In this example ```XGBOOST_REGISTER_OBJECTIVE``` in [this line](custom_obj.cc#L75)
- Create a [plugin.mk](plugin.mk) on this folder
To add this plugin, add the following line to ```config.mk```(template in make/config.mk).
```makefile
# Add plugin by including the plugin in config
include plugin/example/plugin.mk
```
Then you can test this plugin by using ```objective=mylogistic``` parameter.

View File

@ -0,0 +1,80 @@
/*!
* Copyright 2015 by Contributors
* \file custom_obj.cc
* \brief This is an example to define a plugin of xgboost.
*   This plugin defines an additional objective function.
*/
#include <xgboost/base.h>
#include <dmlc/parameter.h>
#include <xgboost/objective.h>
namespace xgboost {
namespace obj {
// Parameter set for the custom logistic objective.
// dmlc::Parameter is a helpful data structure to define parameters;
// you do not have to use it. See
// http://dmlc-core.readthedocs.org/en/latest/parameter.html
// for an introduction to this module.
struct MyLogisticParam : public dmlc::Parameter<MyLogisticParam> {
// multiplier applied to the weight of negative (label == 0) examples.
float scale_neg_weight;
// declare parameters: name, default value, lower bound and help text.
DMLC_DECLARE_PARAMETER(MyLogisticParam) {
DMLC_DECLARE_FIELD(scale_neg_weight).set_default(1.0f).set_lower_bound(0.0f)
.describe("Scale the weight of negative examples by this factor");
}
};
// Register the parameter structure so InitAllowUnknown can populate it.
DMLC_REGISTER_PARAMETER(MyLogisticParam);
// Define a customized logistic regression objective in C++.
// Implement the interface.
class MyLogistic : public ObjFunction {
public:
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
param_.InitAllowUnknown(args);
}
void GetGradient(const std::vector<float> &preds,
const MetaInfo &info,
int iter,
std::vector<bst_gpair> *out_gpair) override {
out_gpair->resize(preds.size());
for (size_t i = 0; i < preds.size(); ++i) {
float w = info.GetWeight(i);
// scale the negative examples!
if (info.labels[i] == 0.0f) w *= param_.scale_neg_weight;
// logistic transoformation
float p = 1.0f / (1.0f + expf(-preds[i]));
// this is the gradient
float grad = (p - info.labels[i]) * w;
// this is the second order gradient
float hess = p * (1.0f - p) * w;
out_gpair->at(i) = bst_gpair(grad, hess);
}
}
const char* DefaultEvalMetric() const override {
return "error";
}
void PredTransform(std::vector<float> *io_preds) override {
// transform margin value to probability.
std::vector<float> &preds = *io_preds;
for (size_t i = 0; i < preds.size(); ++i) {
preds[i] = 1.0f / (1.0f + expf(-preds[i]));
}
}
float ProbToMargin(float base_score) const override {
// transform probability to margin value
return -std::log(1.0f / base_score - 1.0f);
}
private:
MyLogisticParam param_;
};
// Finally, register the objective function with the global registry.
// After this succeeds you can use xgboost with objective=mylogistic.
XGBOOST_REGISTER_OBJECTIVE(MyLogistic, "mylogistic")
.describe("User defined logistic regression plugin")
.set_body([]() { return new MyLogistic(); });
} // namespace obj
} // namespace xgboost

4
plugin/example/plugin.mk Normal file
View File

@ -0,0 +1,4 @@
# Makefile fragment for the example plugin; include it from config.mk.
# Add the object files you would like to include in this plugin.
PLUGIN_OBJS += build_plugin/example/custom_obj.o
# Add additional dependent libraries this plugin might have
PLUGIN_LDFLAGS +=

2
plugin/lz4/plugin.mk Normal file
View File

@ -0,0 +1,2 @@
# Makefile fragment for the LZ4 plugin; include it from config.mk.
# Object file compiled into libxgboost when this plugin is enabled.
PLUGIN_OBJS += build_plugin/lz4/sparse_page_lz4_format.o
# The plugin links against the system LZ4 library.
PLUGIN_LDFLAGS += -llz4

View File

@ -0,0 +1,281 @@
/*!
* Copyright (c) 2015 by Contributors
* \file sparse_page_lz4_format.cc
* XGBoost Plugin to enable LZ4 compressed format on the external memory pages.
*/
#include <xgboost/data.h>
#include <xgboost/logging.h>
#include <dmlc/registry.h>
#include <lz4.h>
#include <lz4hc.h>
#include "../../src/data/sparse_batch_page.h"
namespace xgboost {
namespace data {
DMLC_REGISTRY_FILE_TAG(sparse_page_lz4_format);
// Array container that compresses / decompresses its content in chunks,
// so that individual chunks can be processed by different threads.
template<typename DType>
class CompressArray {
public:
// the data content.
std::vector<DType> data;
// ---- decompression helpers ----
// number of chunks the stored stream was split into.
inline int num_chunk() const {
CHECK_GT(raw_chunks_.size(), 1);
return static_cast<int>(raw_chunks_.size() - 1);
}
// total size of the raw (uncompressed) data, in bytes.
inline size_t RawBytes() const {
return raw_chunks_.back() * sizeof(DType);
}
// total size of the encoded stream, in bytes, including the chunk tables.
inline size_t EncodedBytes() const {
return encoded_chunks_.back() +
(encoded_chunks_.size() + raw_chunks_.size()) * sizeof(bst_uint);
}
// load the chunk tables and the encoded buffer from file.
inline void Read(dmlc::SeekStream* fi);
// run decode on chunk chunk_id; call Read first.
inline void Decompress(int chunk_id);
// ---- compression helpers ----
// initialize the compression chunks from explicit chunk boundaries.
inline void InitCompressChunks(const std::vector<bst_uint>& chunk_ptr);
// initialize the compression chunks by target chunk size,
// capped at max_nchunk chunks.
inline void InitCompressChunks(size_t chunk_size, size_t max_nchunk);
// run encode on chunk chunk_id; call InitCompressChunks first.
inline void Compress(int chunk_id);
// save the output buffer into file.
inline void Write(dmlc::Stream* fo);
private:
// the chunk split of the data, by number of elements
std::vector<bst_uint> raw_chunks_;
// the encoded chunk, by number of bytes
std::vector<bst_uint> encoded_chunks_;
// output buffer of compression.
std::vector<std::string> out_buffer_;
// input buffer of data.
std::string in_buffer_;
};
// Load the chunk tables and the encoded byte stream from fi.
// The payload stays compressed until Decompress is called per chunk.
template<typename DType>
inline void CompressArray<DType>::Read(dmlc::SeekStream* fi) {
  // the two chunk tables come first in the stream.
  CHECK(fi->Read(&raw_chunks_));
  CHECK(fi->Read(&encoded_chunks_));
  // the last entry of the encoded table is the total encoded size.
  const size_t nbytes = encoded_chunks_.back();
  in_buffer_.resize(nbytes);
  CHECK_EQ(fi->Read(dmlc::BeginPtr(in_buffer_), nbytes), nbytes);
  // reserve room for the decompressed elements.
  data.resize(raw_chunks_.back());
}
// Decode one chunk of the encoded stream into its slot of data.
// Distinct chunk ids touch disjoint ranges, so chunks can be decoded
// concurrently after a single Read.
template<typename DType>
inline void CompressArray<DType>::Decompress(int chunk_id) {
  // byte extents of this chunk on the raw and the encoded side.
  const int raw_nbytes = static_cast<int>(
      raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType);
  const int enc_nbytes = static_cast<int>(
      encoded_chunks_[chunk_id + 1] - encoded_chunks_[chunk_id]);
  const char* src = dmlc::BeginPtr(in_buffer_) + encoded_chunks_[chunk_id];
  char* dst = reinterpret_cast<char*>(
      dmlc::BeginPtr(data) + raw_chunks_[chunk_id]);
  // LZ4_decompress_fast returns the number of source bytes it consumed.
  const int consumed = LZ4_decompress_fast(src, dst, raw_nbytes);
  CHECK_EQ(enc_nbytes, consumed);
}
// Prepare compression using caller-provided chunk boundaries.
// chunk_ptr holds element offsets; it must contain at least two entries.
template<typename DType>
inline void CompressArray<DType>::InitCompressChunks(
    const std::vector<bst_uint>& chunk_ptr) {
  raw_chunks_ = chunk_ptr;
  CHECK_GE(raw_chunks_.size(), 2);
  // one scratch output buffer per chunk, pre-sized to its raw extent.
  const size_t nchunk = raw_chunks_.size() - 1;
  out_buffer_.resize(nchunk);
  for (size_t cid = 0; cid < nchunk; ++cid) {
    out_buffer_[cid].resize(raw_chunks_[cid + 1] - raw_chunks_[cid]);
  }
}
// Prepare compression by splitting data into chunks of roughly
// chunk_size elements, enlarging chunk_size if needed so that the
// number of chunks never exceeds max_nchunk.
template<typename DType>
inline void CompressArray<DType>::InitCompressChunks(size_t chunk_size, size_t max_nchunk) {
  raw_chunks_.clear();
  raw_chunks_.push_back(0);
  // grow the chunk size so we stay within max_nchunk chunks.
  size_t min_chunk_size = data.size() / max_nchunk;
  chunk_size = std::max(min_chunk_size, chunk_size);
  size_t nstep = data.size() / chunk_size;
  for (size_t i = 0; i < nstep; ++i) {
    // BUG FIX: each boundary advances by chunk_size; the previous code
    // added chunk_size * i, which created a zero-length first chunk and
    // boundaries that overshoot data.size(), underflowing the final
    // chunk size once the last boundary was clamped below.
    raw_chunks_.push_back(raw_chunks_.back() + chunk_size);
    CHECK_LE(raw_chunks_.back(), data.size());
  }
  // data smaller than one chunk still needs a (possibly empty) chunk.
  if (nstep == 0) raw_chunks_.push_back(0);
  // the last boundary absorbs the remainder elements.
  raw_chunks_.back() = data.size();
  CHECK_GE(raw_chunks_.size(), 2);
  // one scratch output buffer per chunk, pre-sized to its raw extent.
  out_buffer_.resize(raw_chunks_.size() - 1);
  for (size_t i = 0; i < out_buffer_.size(); ++i) {
    out_buffer_[i].resize(raw_chunks_[i + 1] - raw_chunks_[i]);
  }
}
// Compress one chunk of data into its own output buffer using LZ4-HC.
// Distinct chunk ids are independent, so chunks can be compressed
// concurrently after InitCompressChunks.
template<typename DType>
inline void CompressArray<DType>::Compress(int chunk_id) {
  CHECK_LT(static_cast<size_t>(chunk_id + 1), raw_chunks_.size());
  std::string& dst = out_buffer_[chunk_id];
  const size_t src_nbytes =
      (raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType);
  // worst-case compressed size; 0 signals an input that is too large.
  const int worst_case = LZ4_compressBound(src_nbytes);
  CHECK_NE(worst_case, 0);
  dst.resize(worst_case);
  const char* src = reinterpret_cast<char*>(
      dmlc::BeginPtr(data) + raw_chunks_[chunk_id]);
  // compression level 9 favours ratio over speed.
  const int enc_nbytes =
      LZ4_compress_HC(src, dmlc::BeginPtr(dst), src_nbytes, dst.length(), 9);
  CHECK_NE(enc_nbytes, 0);
  CHECK_LE(static_cast<size_t>(enc_nbytes), dst.length());
  // shrink the buffer to the actual encoded size.
  dst.resize(enc_nbytes);
}
// Flush the chunk tables followed by every compressed buffer to fo.
// Must run after all Compress calls for this array have finished.
template<typename DType>
inline void CompressArray<DType>::Write(dmlc::Stream* fo) {
  // rebuild the encoded offset table as a prefix sum of buffer lengths.
  encoded_chunks_.clear();
  encoded_chunks_.push_back(0);
  for (const std::string& chunk : out_buffer_) {
    encoded_chunks_.push_back(encoded_chunks_.back() + chunk.length());
  }
  // tables first, then the concatenated compressed payload.
  fo->Write(raw_chunks_);
  fo->Write(encoded_chunks_);
  for (const std::string& chunk : out_buffer_) {
    fo->Write(dmlc::BeginPtr(chunk), chunk.length());
  }
}
// SparsePage::Format implementation that stores the index and value
// arrays of a sparse page LZ4-compressed, chunk by chunk, so that
// (de)compression can run on multiple threads.
class SparsePageLZ4Format : public SparsePage::Format {
public:
SparsePageLZ4Format()
: raw_bytes_(0), encoded_bytes_(0) {
// number of threads used for chunk (de)compression.
nthread_ = 4;
raw_bytes_ = encoded_bytes_ = 0;
}
// Log the overall compression ratio observed during this session.
~SparsePageLZ4Format() {
if (raw_bytes_ != 0) {
LOG(CONSOLE) << "raw_bytes=" << raw_bytes_
<< ", encoded_bytes=" << encoded_bytes_
<< ", ratio=" << double(encoded_bytes_) / raw_bytes_;
}
}
// Read a whole page: plain offsets first, then the compressed
// index/value arrays, which are re-assembled into Entry records.
bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
if (!fi->Read(&(page->offset))) return false;
CHECK_NE(page->offset.size(), 0) << "Invalid SparsePage file";
this->LoadIndexValue(fi);
page->data.resize(page->offset.back());
CHECK_EQ(index_.data.size(), value_.data.size());
CHECK_EQ(index_.data.size(), page->data.size());
for (size_t i = 0; i < page->data.size(); ++i) {
page->data[i] = SparseBatch::Entry(index_.data[i], value_.data[i]);
}
return true;
}
// Read only the columns listed in sorted_index_set.
// NOTE(review): the full page is still decompressed by LoadIndexValue;
// only the copy into page->data is restricted to the requested columns.
bool Read(SparsePage* page,
dmlc::SeekStream* fi,
const std::vector<bst_uint>& sorted_index_set) override {
if (!fi->Read(&disk_offset_)) return false;
this->LoadIndexValue(fi);
// rebuild the page offsets for the selected columns only.
page->offset.clear();
page->offset.push_back(0);
for (bst_uint cid : sorted_index_set) {
page->offset.push_back(
page->offset.back() + disk_offset_[cid + 1] - disk_offset_[cid]);
}
page->data.resize(page->offset.back());
CHECK_EQ(index_.data.size(), value_.data.size());
CHECK_EQ(index_.data.size(), disk_offset_.back());
// gather the entries belonging to each requested column.
for (size_t i = 0; i < sorted_index_set.size(); ++i) {
bst_uint cid = sorted_index_set[i];
size_t dst_begin = page->offset[i];
size_t src_begin = disk_offset_[cid];
size_t num = disk_offset_[cid + 1] - disk_offset_[cid];
for (size_t j = 0; j < num; ++j) {
page->data[dst_begin + j] = SparseBatch::Entry(
index_.data[src_begin + j], value_.data[src_begin + j]);
}
}
return true;
}
// Write a page: plain offsets, then chunk-compressed indices and values.
void Write(const SparsePage& page, dmlc::Stream* fo) override {
CHECK(page.offset.size() != 0 && page.offset[0] == 0);
CHECK_EQ(page.offset.back(), page.data.size());
fo->Write(page.offset);
// split the entries into separate index / value arrays, which are
// compressed independently of each other.
index_.data.resize(page.data.size());
value_.data.resize(page.data.size());
for (size_t i = 0; i < page.data.size(); ++i) {
index_.data[i] = page.data[i].index;
value_.data[i] = page.data[i].fvalue;
}
index_.InitCompressChunks(kChunkSize, kMaxChunk);
value_.InitCompressChunks(kChunkSize, kMaxChunk);
int nindex = index_.num_chunk();
int nvalue = value_.num_chunk();
int ntotal = nindex + nvalue;
// compress all chunks of both arrays in parallel.
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_)
for (int i = 0; i < ntotal; ++i) {
if (i < nindex) {
index_.Compress(i);
} else {
value_.Compress(i - nindex);
}
}
index_.Write(fo);
value_.Write(fo);
// accumulate the statistics reported by the destructor.
raw_bytes_ += index_.RawBytes() + value_.RawBytes() + page.offset.size() * sizeof(size_t);
encoded_bytes_ += index_.EncodedBytes() +
value_.EncodedBytes() + page.offset.size() * sizeof(size_t);
}
// Load the index and value arrays from fi, decoding the chunks of
// both arrays in parallel.
inline void LoadIndexValue(dmlc::SeekStream* fi) {
index_.Read(fi);
value_.Read(fi);
int nindex = index_.num_chunk();
int nvalue = value_.num_chunk();
int ntotal = nindex + nvalue;
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_)
for (int i = 0; i < ntotal; ++i) {
if (i < nindex) {
index_.Decompress(i);
} else {
value_.Decompress(i - nindex);
}
}
}
private:
// default chunk size, in number of elements.
static const size_t kChunkSize = 64 << 10UL;
// maximum number of chunks per array (passed as max_nchunk).
static const size_t kMaxChunk = 64;
// number of threads
int nthread_;
// total raw bytes written so far.
size_t raw_bytes_;
// total encoded bytes written so far.
size_t encoded_bytes_;
/*! \brief external memory column offset */
std::vector<size_t> disk_offset_;
// compressed storage for entry indices.
CompressArray<bst_uint> index_;
// compressed storage for entry values.
CompressArray<bst_float> value_;
};
// Register the format under the name "lz4" so it can be selected
// through the sparse page format registry.
XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(lz4)
.describe("Apply LZ4 binary data compression for ext memory.")
.set_body([]() {
return new SparsePageLZ4Format();
});
} // namespace data
} // namespace xgboost

View File

@ -255,5 +255,8 @@ std::string SparsePage::Format::DecideFormat(const std::string& cache_prefix) {
return "raw";
}
}
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);
} // namespace data
} // namespace xgboost

View File

@ -155,7 +155,7 @@ class SparsePage::Format {
* \brief save the data to fo, when a page was written.
* \param fo output stream
*/
virtual void Write(const SparsePage& page, dmlc::Stream* fo) const = 0;
virtual void Write(const SparsePage& page, dmlc::Stream* fo) = 0;
/*!
* \brief Create sparse page of format.
* \return The created format functors.

View File

@ -71,7 +71,7 @@ class SparsePageDMatrix : public DMatrix {
/*! \brief page size 256 MB */
static const size_t kPageSize = 256UL << 20UL;
/*! \brief Maximum number of rows per batch. */
static const size_t kMaxRowPerBatch = 32UL << 10UL;
static const size_t kMaxRowPerBatch = 64UL << 10UL;
private:
// declare the column batch iter.

View File

@ -4,11 +4,14 @@
* Raw binary format of sparse page.
*/
#include <xgboost/data.h>
#include <dmlc/registry.h>
#include "./sparse_batch_page.h"
namespace xgboost {
namespace data {
DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format);
class SparsePageRawFormat : public SparsePage::Format {
public:
bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
@ -73,7 +76,7 @@ class SparsePageRawFormat : public SparsePage::Format {
return true;
}
void Write(const SparsePage& page, dmlc::Stream* fo) const override {
void Write(const SparsePage& page, dmlc::Stream* fo) override {
CHECK(page.offset.size() != 0 && page.offset[0] == 0);
CHECK_EQ(page.offset.back(), page.data.size());
fo->Write(page.offset);

View File

@ -108,10 +108,10 @@ class RegLossObj : public ObjFunction {
LOG(FATAL) << Loss::LabelErrorMsg();
}
}
virtual const char* DefaultEvalMetric() const {
const char* DefaultEvalMetric() const override {
return Loss::DefaultEvalMetric();
}
virtual void PredTransform(std::vector<float> *io_preds) {
void PredTransform(std::vector<float> *io_preds) override {
std::vector<float> &preds = *io_preds;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
#pragma omp parallel for schedule(static)
@ -119,7 +119,7 @@ class RegLossObj : public ObjFunction {
preds[j] = Loss::PredTransform(preds[j]);
}
}
virtual float ProbToMargin(float base_score) const {
float ProbToMargin(float base_score) const override {
return Loss::ProbToMargin(base_score);
}