[PLUGIN] Add plugin system
This commit is contained in:
32
plugin/README.md
Normal file
32
plugin/README.md
Normal file
@@ -0,0 +1,32 @@
|
||||
XGBoost Plugins Modules
|
||||
=======================
|
||||
This folder contains plugin modules to xgboost that can be optionally installed.
|
||||
The plugin system helps us to extend xgboost with additional features,
|
||||
and add experimental features that may not yet be ready to be included in the main project.
|
||||
|
||||
To include a certain plugin, say ```plugin_a```, you only need to add the following line to the config.mk.
|
||||
|
||||
```makefile
|
||||
# Add plugin by include the plugin in config
|
||||
include plugin/plugin_a/plugin.mk
|
||||
```
|
||||
|
||||
Then rebuild libxgboost by typing make; you will get a new library with the plugin enabled.
|
||||
|
||||
Link Static XGBoost Library with Plugins
|
||||
----------------------------------------
|
||||
This problem only happens when you link ```libxgboost.a```.
|
||||
If you only use ```libxgboost.so``` (this includes the Python and other bindings),
|
||||
you can ignore this section.
|
||||
|
||||
When you want to link ```libxgboost.a``` with additional plugins included,
|
||||
you will need to enable the whole archive via the following option.
|
||||
```bash
|
||||
--whole-archive libxgboost.a --no-whole-archive
|
||||
```
|
||||
|
||||
Write Your Own Plugin
|
||||
---------------------
|
||||
You can plug in your own modules to xgboost by adding code to this folder,
|
||||
without modification to the main code repo.
|
||||
The [example](example) folder provides an example to write a plugin.
|
||||
21
plugin/example/README.md
Normal file
21
plugin/example/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
XGBoost Plugin Example
|
||||
======================
|
||||
This folder provides an example of xgboost plugin.
|
||||
|
||||
There are three steps you need to do to add a plugin to xgboost
|
||||
- Create your source .cc file, implement a new extension
|
||||
- In this example [custom_obj.cc](custom_obj.cc)
|
||||
- Register this extension to xgboost via the registration macro
|
||||
- In this example ```XGBOOST_REGISTER_OBJECTIVE``` in [this line](custom_obj.cc#L75)
|
||||
- Create a [plugin.mk](plugin.mk) on this folder
|
||||
|
||||
To add this plugin, add the following line to ```config.mk```(template in make/config.mk).
|
||||
```makefile
|
||||
# Add plugin by include the plugin in config
|
||||
include plugin/example/plugin.mk
|
||||
```
|
||||
|
||||
Then you can test this plugin by using ```objective=mylogistic``` parameter.
|
||||
|
||||
|
||||
|
||||
80
plugin/example/custom_obj.cc
Normal file
80
plugin/example/custom_obj.cc
Normal file
@@ -0,0 +1,80 @@
|
||||
/*!
|
||||
* Copyright 2015 by Contributors
|
||||
 * \file custom_obj.cc
|
||||
* \brief This is an example to define plugin of xgboost.
|
||||
 * This plugin defines an additional objective function.
|
||||
*/
|
||||
#include <xgboost/base.h>
|
||||
#include <dmlc/parameter.h>
|
||||
#include <xgboost/objective.h>
|
||||
|
||||
namespace xgboost {
|
||||
namespace obj {
|
||||
|
||||
// This is a helpful data structure to define parameters
|
||||
// You do not have to use it.
|
||||
// see http://dmlc-core.readthedocs.org/en/latest/parameter.html
|
||||
// for introduction of this module.
|
||||
struct MyLogisticParam : public dmlc::Parameter<MyLogisticParam> {
|
||||
float scale_neg_weight;
|
||||
// declare parameters
|
||||
DMLC_DECLARE_PARAMETER(MyLogisticParam) {
|
||||
DMLC_DECLARE_FIELD(scale_neg_weight).set_default(1.0f).set_lower_bound(0.0f)
|
||||
.describe("Scale the weight of negative examples by this factor");
|
||||
}
|
||||
};
|
||||
|
||||
DMLC_REGISTER_PARAMETER(MyLogisticParam);
|
||||
|
||||
// Define a customized logistic regression objective in C++.
|
||||
// Implement the interface.
|
||||
class MyLogistic : public ObjFunction {
|
||||
public:
|
||||
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
|
||||
param_.InitAllowUnknown(args);
|
||||
}
|
||||
void GetGradient(const std::vector<float> &preds,
|
||||
const MetaInfo &info,
|
||||
int iter,
|
||||
std::vector<bst_gpair> *out_gpair) override {
|
||||
out_gpair->resize(preds.size());
|
||||
for (size_t i = 0; i < preds.size(); ++i) {
|
||||
float w = info.GetWeight(i);
|
||||
// scale the negative examples!
|
||||
if (info.labels[i] == 0.0f) w *= param_.scale_neg_weight;
|
||||
// logistic transoformation
|
||||
float p = 1.0f / (1.0f + expf(-preds[i]));
|
||||
// this is the gradient
|
||||
float grad = (p - info.labels[i]) * w;
|
||||
// this is the second order gradient
|
||||
float hess = p * (1.0f - p) * w;
|
||||
out_gpair->at(i) = bst_gpair(grad, hess);
|
||||
}
|
||||
}
|
||||
const char* DefaultEvalMetric() const override {
|
||||
return "error";
|
||||
}
|
||||
void PredTransform(std::vector<float> *io_preds) override {
|
||||
// transform margin value to probability.
|
||||
std::vector<float> &preds = *io_preds;
|
||||
for (size_t i = 0; i < preds.size(); ++i) {
|
||||
preds[i] = 1.0f / (1.0f + expf(-preds[i]));
|
||||
}
|
||||
}
|
||||
float ProbToMargin(float base_score) const override {
|
||||
// transform probability to margin value
|
||||
return -std::log(1.0f / base_score - 1.0f);
|
||||
}
|
||||
|
||||
private:
|
||||
MyLogisticParam param_;
|
||||
};
|
||||
|
||||
// Finally register the objective function.
|
||||
// After it succeeds you can try use xgboost with objective=mylogistic
|
||||
XGBOOST_REGISTER_OBJECTIVE(MyLogistic, "mylogistic")
|
||||
.describe("User defined logistic regression plugin")
|
||||
.set_body([]() { return new MyLogistic(); });
|
||||
|
||||
} // namespace obj
|
||||
} // namespace xgboost
|
||||
4
plugin/example/plugin.mk
Normal file
4
plugin/example/plugin.mk
Normal file
@@ -0,0 +1,4 @@
|
||||
# Add the object files you like to include in this plugin.
|
||||
PLUGIN_OBJS += build_plugin/example/custom_obj.o
|
||||
# Add additional dependent libraries this plugin might have
|
||||
PLUGIN_LDFLAGS +=
|
||||
2
plugin/lz4/plugin.mk
Normal file
2
plugin/lz4/plugin.mk
Normal file
@@ -0,0 +1,2 @@
|
||||
PLUGIN_OBJS += build_plugin/lz4/sparse_page_lz4_format.o
|
||||
PLUGIN_LDFLAGS += -llz4
|
||||
281
plugin/lz4/sparse_page_lz4_format.cc
Normal file
281
plugin/lz4/sparse_page_lz4_format.cc
Normal file
@@ -0,0 +1,281 @@
|
||||
/*!
|
||||
* Copyright (c) 2015 by Contributors
|
||||
* \file sparse_page_lz4_format.cc
|
||||
* XGBoost Plugin to enable LZ4 compressed format on the external memory pages.
|
||||
*/
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/logging.h>
|
||||
#include <dmlc/registry.h>
|
||||
#include <lz4.h>
|
||||
#include <lz4hc.h>
|
||||
#include "../../src/data/sparse_batch_page.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
DMLC_REGISTRY_FILE_TAG(sparse_page_lz4_format);
|
||||
|
||||
// array to help compression of decompression.
|
||||
template<typename DType>
|
||||
class CompressArray {
|
||||
public:
|
||||
// the data content.
|
||||
std::vector<DType> data;
|
||||
// Decompression helper
|
||||
// number of chunks
|
||||
inline int num_chunk() const {
|
||||
CHECK_GT(raw_chunks_.size(), 1);
|
||||
return static_cast<int>(raw_chunks_.size() - 1);
|
||||
}
|
||||
// raw bytes
|
||||
inline size_t RawBytes() const {
|
||||
return raw_chunks_.back() * sizeof(DType);
|
||||
}
|
||||
// encoded bytes
|
||||
inline size_t EncodedBytes() const {
|
||||
return encoded_chunks_.back() +
|
||||
(encoded_chunks_.size() + raw_chunks_.size()) * sizeof(bst_uint);
|
||||
}
|
||||
// load the array from file.
|
||||
inline void Read(dmlc::SeekStream* fi);
|
||||
// run decode on chunk_id
|
||||
inline void Decompress(int chunk_id);
|
||||
// Compression helper
|
||||
// initialize the compression chunks
|
||||
inline void InitCompressChunks(const std::vector<bst_uint>& chunk_ptr);
|
||||
// initialize the compression chunks
|
||||
inline void InitCompressChunks(size_t chunk_size, size_t max_nchunk);
|
||||
// run decode on chunk_id
|
||||
inline void Compress(int chunk_id);
|
||||
// save the output buffer into file.
|
||||
inline void Write(dmlc::Stream* fo);
|
||||
|
||||
private:
|
||||
// the chunk split of the data, by number of elements
|
||||
std::vector<bst_uint> raw_chunks_;
|
||||
// the encoded chunk, by number of bytes
|
||||
std::vector<bst_uint> encoded_chunks_;
|
||||
// output buffer of compression.
|
||||
std::vector<std::string> out_buffer_;
|
||||
// input buffer of data.
|
||||
std::string in_buffer_;
|
||||
};
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::Read(dmlc::SeekStream* fi) {
|
||||
CHECK(fi->Read(&raw_chunks_));
|
||||
CHECK(fi->Read(&encoded_chunks_));
|
||||
size_t buffer_size = encoded_chunks_.back();
|
||||
in_buffer_.resize(buffer_size);
|
||||
CHECK_EQ(fi->Read(dmlc::BeginPtr(in_buffer_), buffer_size), buffer_size);
|
||||
data.resize(raw_chunks_.back());
|
||||
}
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::Decompress(int chunk_id) {
|
||||
int chunk_size = static_cast<int>(
|
||||
raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType);
|
||||
int encoded_size = static_cast<int>(
|
||||
encoded_chunks_[chunk_id + 1] - encoded_chunks_[chunk_id]);
|
||||
// decompress data
|
||||
int src_size = LZ4_decompress_fast(
|
||||
dmlc::BeginPtr(in_buffer_) + encoded_chunks_[chunk_id],
|
||||
reinterpret_cast<char*>(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]),
|
||||
chunk_size);
|
||||
CHECK_EQ(encoded_size, src_size);
|
||||
}
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::InitCompressChunks(
|
||||
const std::vector<bst_uint>& chunk_ptr) {
|
||||
raw_chunks_ = chunk_ptr;
|
||||
CHECK_GE(raw_chunks_.size(), 2);
|
||||
out_buffer_.resize(raw_chunks_.size() - 1);
|
||||
for (size_t i = 0; i < out_buffer_.size(); ++i) {
|
||||
out_buffer_[i].resize(raw_chunks_[i + 1] - raw_chunks_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::InitCompressChunks(size_t chunk_size, size_t max_nchunk) {
|
||||
raw_chunks_.clear();
|
||||
raw_chunks_.push_back(0);
|
||||
size_t min_chunk_size = data.size() / max_nchunk;
|
||||
chunk_size = std::max(min_chunk_size, chunk_size);
|
||||
size_t nstep = data.size() / chunk_size;
|
||||
for (size_t i = 0; i < nstep; ++i) {
|
||||
raw_chunks_.push_back(raw_chunks_.back() + chunk_size * i);
|
||||
}
|
||||
if (nstep == 0) raw_chunks_.push_back(0);
|
||||
raw_chunks_.back() = data.size();
|
||||
CHECK_GE(raw_chunks_.size(), 2);
|
||||
out_buffer_.resize(raw_chunks_.size() - 1);
|
||||
for (size_t i = 0; i < out_buffer_.size(); ++i) {
|
||||
out_buffer_[i].resize(raw_chunks_[i + 1] - raw_chunks_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::Compress(int chunk_id) {
|
||||
CHECK_LT(static_cast<size_t>(chunk_id + 1), raw_chunks_.size());
|
||||
std::string& buf = out_buffer_[chunk_id];
|
||||
size_t raw_chunk_size = (raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType);
|
||||
int bound = LZ4_compressBound(raw_chunk_size);
|
||||
CHECK_NE(bound, 0);
|
||||
buf.resize(bound);
|
||||
int encoded_size = LZ4_compress_HC(
|
||||
reinterpret_cast<char*>(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]),
|
||||
dmlc::BeginPtr(buf), raw_chunk_size, buf.length(), 9);
|
||||
CHECK_NE(encoded_size, 0);
|
||||
CHECK_LE(static_cast<size_t>(encoded_size), buf.length());
|
||||
buf.resize(encoded_size);
|
||||
}
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::Write(dmlc::Stream* fo) {
|
||||
encoded_chunks_.clear();
|
||||
encoded_chunks_.push_back(0);
|
||||
for (size_t i = 0; i < out_buffer_.size(); ++i) {
|
||||
encoded_chunks_.push_back(encoded_chunks_.back() + out_buffer_[i].length());
|
||||
}
|
||||
fo->Write(raw_chunks_);
|
||||
fo->Write(encoded_chunks_);
|
||||
for (const std::string& buf : out_buffer_) {
|
||||
fo->Write(dmlc::BeginPtr(buf), buf.length());
|
||||
}
|
||||
}
|
||||
|
||||
class SparsePageLZ4Format : public SparsePage::Format {
|
||||
public:
|
||||
SparsePageLZ4Format()
|
||||
: raw_bytes_(0), encoded_bytes_(0) {
|
||||
nthread_ = 4;
|
||||
raw_bytes_ = encoded_bytes_ = 0;
|
||||
}
|
||||
~SparsePageLZ4Format() {
|
||||
if (raw_bytes_ != 0) {
|
||||
LOG(CONSOLE) << "raw_bytes=" << raw_bytes_
|
||||
<< ", encoded_bytes=" << encoded_bytes_
|
||||
<< ", ratio=" << double(encoded_bytes_) / raw_bytes_;
|
||||
}
|
||||
}
|
||||
|
||||
bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
|
||||
if (!fi->Read(&(page->offset))) return false;
|
||||
CHECK_NE(page->offset.size(), 0) << "Invalid SparsePage file";
|
||||
this->LoadIndexValue(fi);
|
||||
|
||||
page->data.resize(page->offset.back());
|
||||
CHECK_EQ(index_.data.size(), value_.data.size());
|
||||
CHECK_EQ(index_.data.size(), page->data.size());
|
||||
for (size_t i = 0; i < page->data.size(); ++i) {
|
||||
page->data[i] = SparseBatch::Entry(index_.data[i], value_.data[i]);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Read(SparsePage* page,
|
||||
dmlc::SeekStream* fi,
|
||||
const std::vector<bst_uint>& sorted_index_set) override {
|
||||
if (!fi->Read(&disk_offset_)) return false;
|
||||
this->LoadIndexValue(fi);
|
||||
|
||||
page->offset.clear();
|
||||
page->offset.push_back(0);
|
||||
for (bst_uint cid : sorted_index_set) {
|
||||
page->offset.push_back(
|
||||
page->offset.back() + disk_offset_[cid + 1] - disk_offset_[cid]);
|
||||
}
|
||||
page->data.resize(page->offset.back());
|
||||
CHECK_EQ(index_.data.size(), value_.data.size());
|
||||
CHECK_EQ(index_.data.size(), disk_offset_.back());
|
||||
|
||||
for (size_t i = 0; i < sorted_index_set.size(); ++i) {
|
||||
bst_uint cid = sorted_index_set[i];
|
||||
size_t dst_begin = page->offset[i];
|
||||
size_t src_begin = disk_offset_[cid];
|
||||
size_t num = disk_offset_[cid + 1] - disk_offset_[cid];
|
||||
for (size_t j = 0; j < num; ++j) {
|
||||
page->data[dst_begin + j] = SparseBatch::Entry(
|
||||
index_.data[src_begin + j], value_.data[src_begin + j]);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void Write(const SparsePage& page, dmlc::Stream* fo) override {
|
||||
CHECK(page.offset.size() != 0 && page.offset[0] == 0);
|
||||
CHECK_EQ(page.offset.back(), page.data.size());
|
||||
fo->Write(page.offset);
|
||||
index_.data.resize(page.data.size());
|
||||
value_.data.resize(page.data.size());
|
||||
|
||||
for (size_t i = 0; i < page.data.size(); ++i) {
|
||||
index_.data[i] = page.data[i].index;
|
||||
value_.data[i] = page.data[i].fvalue;
|
||||
}
|
||||
|
||||
index_.InitCompressChunks(kChunkSize, kMaxChunk);
|
||||
value_.InitCompressChunks(kChunkSize, kMaxChunk);
|
||||
|
||||
int nindex = index_.num_chunk();
|
||||
int nvalue = value_.num_chunk();
|
||||
int ntotal = nindex + nvalue;
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_)
|
||||
for (int i = 0; i < ntotal; ++i) {
|
||||
if (i < nindex) {
|
||||
index_.Compress(i);
|
||||
} else {
|
||||
value_.Compress(i - nindex);
|
||||
}
|
||||
}
|
||||
index_.Write(fo);
|
||||
value_.Write(fo);
|
||||
raw_bytes_ += index_.RawBytes() + value_.RawBytes() + page.offset.size() * sizeof(size_t);
|
||||
encoded_bytes_ += index_.EncodedBytes() +
|
||||
value_.EncodedBytes() + page.offset.size() * sizeof(size_t);
|
||||
}
|
||||
|
||||
inline void LoadIndexValue(dmlc::SeekStream* fi) {
|
||||
index_.Read(fi);
|
||||
value_.Read(fi);
|
||||
|
||||
int nindex = index_.num_chunk();
|
||||
int nvalue = value_.num_chunk();
|
||||
int ntotal = nindex + nvalue;
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_)
|
||||
for (int i = 0; i < ntotal; ++i) {
|
||||
if (i < nindex) {
|
||||
index_.Decompress(i);
|
||||
} else {
|
||||
value_.Decompress(i - nindex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// default chunk size.
|
||||
static const size_t kChunkSize = 64 << 10UL;
|
||||
// maximum chunk size.
|
||||
static const size_t kMaxChunk = 64;
|
||||
// number of threads
|
||||
int nthread_;
|
||||
// raw bytes
|
||||
size_t raw_bytes_;
|
||||
// encoded bytes
|
||||
size_t encoded_bytes_;
|
||||
/*! \brief external memory column offset */
|
||||
std::vector<size_t> disk_offset_;
|
||||
// internal index
|
||||
CompressArray<bst_uint> index_;
|
||||
// value set.
|
||||
CompressArray<bst_float> value_;
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(lz4)
|
||||
.describe("Apply LZ4 binary data compression for ext memory.")
|
||||
.set_body([]() {
|
||||
return new SparsePageLZ4Format();
|
||||
});
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
Reference in New Issue
Block a user