diff --git a/.gitignore b/.gitignore index 97982775a..3ad01e985 100644 --- a/.gitignore +++ b/.gitignore @@ -72,3 +72,4 @@ build config.mk xgboost *.data +build_plugin diff --git a/Makefile b/Makefile index 0243ff553..3528104f4 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,8 @@ +# flags by plugin +PLUGIN_OBJS= +PLUGIN_LDFLAGS= +PLUGIN_CFLAGS= + ifndef config ifneq ("$(wildcard ./config.mk)","") config = config.mk @@ -36,8 +41,8 @@ ifeq ($(OS), Windows_NT) export CC = gcc -m64 endif -export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS) -export CFLAGS= -std=c++0x -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops -fPIC -Iinclude $(ADD_CFLAGS) +export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS) $(PLUGIN_LDFLAGS) +export CFLAGS= -std=c++0x -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops -fPIC -Iinclude $(ADD_CFLAGS) $(PLUGIN_CFLAGS) CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include #java include path export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java @@ -76,7 +81,7 @@ $(RABIT)/lib/$(LIB_RABIT): java: java/libxgboost4j.so SRC = $(wildcard src/*.cc src/*/*.cc) -ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC)) +ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC)) $(PLUGIN_OBJS) AMALGA_OBJ = amalgamation/xgboost-all0.o LIB_DEP = $(DMLC_CORE)/libdmlc.a $(RABIT)/lib/$(LIB_RABIT) ALL_DEP = $(filter-out build/cli_main.o, $(ALL_OBJ)) $(LIB_DEP) @@ -87,6 +92,11 @@ build/%.o: src/%.cc $(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d $(CXX) -c $(CFLAGS) -c $< -o $@ +build_plugin/%.o: plugin/%.cc + @mkdir -p $(@D) + $(CXX) $(CFLAGS) -MM -MT build_plugin/$*.o $< >build_plugin/$*.d + $(CXX) -c $(CFLAGS) -c $< -o $@ + # The should be equivalent to $(ALL_OBJ) except for build/cli_main.o amalgamation/xgboost-all0.o: amalgamation/xgboost-all0.cc $(CXX) -c $(CFLAGS) -c $< -o $@ @@ -104,20 +114,20 @@ lib/libxgboost.so: $(ALL_DEP) @mkdir -p $(@D) $(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) -java/libxgboost4j.so: java/xgboost4j_wrapper.cpp 
lib/libxgboost.a $(LIB_DEP) +java/libxgboost4j.so: java/xgboost4j_wrapper.cpp $(ALL_DEP) $(CXX) $(CFLAGS) $(JAVAINCFLAGS) -shared -o $@ $(filter %.cpp %.o %.a, $^) $(LDFLAGS) -xgboost: $(CLI_OBJ) lib/libxgboost.a $(LIB_DEP) +xgboost: $(CLI_OBJ) $(ALL_DEP) $(CXX) $(CFLAGS) -o $@ $(filter %.o %.a, $^) $(LDFLAGS) rcpplint: python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} R-package/src lint: rcpplint - python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} include src + python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} include src plugin clean: - $(RM) -rf build lib bin *~ */*~ */*/*~ */*/*/*~ amalgamation/*.o xgboost + $(RM) -rf build build_plugin lib bin *~ */*~ */*/*~ */*/*/*~ amalgamation/*.o xgboost clean_all: clean cd $(DMLC_CORE); make clean; cd - @@ -157,3 +167,4 @@ Rcheck: -include build/*.d -include build/*/*.d +-include build_plugin/*/*.d diff --git a/make/config.mk b/make/config.mk index 1bf0d7738..d0954fce4 100644 --- a/make/config.mk +++ b/make/config.mk @@ -46,3 +46,9 @@ LIB_RABIT = librabit.a # path to libjvm.so LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server + +# List of additional plugins, checkout plugin folder. +# uncomment the following lines to include these plugins +# you can also add your own plugin like this + +# include plugin/example/plugin.mk diff --git a/plugin/README.md b/plugin/README.md new file mode 100644 index 000000000..445b60f8e --- /dev/null +++ b/plugin/README.md @@ -0,0 +1,32 @@ +XGBoost Plugins Modules +======================= +This folder contains plugin modules to xgboost that can be optionally installed. +The plugin system helps us to extend xgboost with additional features, +and add experimental features that may not yet ready to be included in main project. + +To include a certain plugin, say ```plugin_a```, you only need to add the following line to the config.mk. 
+
+```makefile
+# Add plugin by including the plugin in config
+include plugin/plugin_a/plugin.mk
+```
+
+Then rebuild libxgboost by typing make; you will get a new library with the plugin enabled.
+
+Link Static XGBoost Library with Plugins
+----------------------------------------
+This problem only happens when you link ```libxgboost.a```.
+If you only use ```libxgboost.so``` (this includes python and other bindings),
+you can ignore this section.
+
+When you want to link ```libxgboost.a``` with additional plugins included,
+you will need to enable whole archive via the following option.
+```bash
+--whole-archive libxgboost.a --no-whole-archive
+```
+
+Write Your Own Plugin
+---------------------
+You can plug your own modules into xgboost by adding code to this folder,
+without modifying the main code repo.
+The [example](example) folder provides an example to write a plugin.
diff --git a/plugin/example/README.md b/plugin/example/README.md
new file mode 100644
index 000000000..7a3eb6685
--- /dev/null
+++ b/plugin/example/README.md
@@ -0,0 +1,21 @@
+XGBoost Plugin Example
+======================
+This folder provides an example of an xgboost plugin.
+
+There are three steps you need to do to add a plugin to xgboost
+- Create your source .cc file, implement a new extension
+  - In this example [custom_obj.cc](custom_obj.cc)
+- Register this extension to xgboost via a registration macro
+  - In this example ```XGBOOST_REGISTER_OBJECTIVE``` in [this line](custom_obj.cc#L75)
+- Create a [plugin.mk](plugin.mk) in this folder
+
+To add this plugin, add the following line to ```config.mk``` (template in make/config.mk).
+```makefile
+# Add plugin by including the plugin in config
+include plugin/example/plugin.mk
+```
+
+Then you can test this plugin by using ```objective=mylogistic``` parameter.
+ + + diff --git a/plugin/example/custom_obj.cc b/plugin/example/custom_obj.cc new file mode 100644 index 000000000..95384f21d --- /dev/null +++ b/plugin/example/custom_obj.cc @@ -0,0 +1,80 @@ +/*! + * Copyright 2015 by Contributors + * \file custom_metric.cc + * \brief This is an example to define plugin of xgboost. + * This plugin defines the additional metric function. + */ +#include +#include +#include + +namespace xgboost { +namespace obj { + +// This is a helpful data structure to define parameters +// You do not have to use it. +// see http://dmlc-core.readthedocs.org/en/latest/parameter.html +// for introduction of this module. +struct MyLogisticParam : public dmlc::Parameter { + float scale_neg_weight; + // declare parameters + DMLC_DECLARE_PARAMETER(MyLogisticParam) { + DMLC_DECLARE_FIELD(scale_neg_weight).set_default(1.0f).set_lower_bound(0.0f) + .describe("Scale the weight of negative examples by this factor"); + } +}; + +DMLC_REGISTER_PARAMETER(MyLogisticParam); + +// Define a customized logistic regression objective in C++. +// Implement the interface. +class MyLogistic : public ObjFunction { + public: + void Configure(const std::vector >& args) override { + param_.InitAllowUnknown(args); + } + void GetGradient(const std::vector &preds, + const MetaInfo &info, + int iter, + std::vector *out_gpair) override { + out_gpair->resize(preds.size()); + for (size_t i = 0; i < preds.size(); ++i) { + float w = info.GetWeight(i); + // scale the negative examples! + if (info.labels[i] == 0.0f) w *= param_.scale_neg_weight; + // logistic transoformation + float p = 1.0f / (1.0f + expf(-preds[i])); + // this is the gradient + float grad = (p - info.labels[i]) * w; + // this is the second order gradient + float hess = p * (1.0f - p) * w; + out_gpair->at(i) = bst_gpair(grad, hess); + } + } + const char* DefaultEvalMetric() const override { + return "error"; + } + void PredTransform(std::vector *io_preds) override { + // transform margin value to probability. 
+ std::vector &preds = *io_preds; + for (size_t i = 0; i < preds.size(); ++i) { + preds[i] = 1.0f / (1.0f + expf(-preds[i])); + } + } + float ProbToMargin(float base_score) const override { + // transform probability to margin value + return -std::log(1.0f / base_score - 1.0f); + } + + private: + MyLogisticParam param_; +}; + +// Finally register the objective function. +// After it succeeds you can try use xgboost with objective=mylogistic +XGBOOST_REGISTER_OBJECTIVE(MyLogistic, "mylogistic") +.describe("User defined logistic regression plugin") +.set_body([]() { return new MyLogistic(); }); + +} // namespace obj +} // namespace xgboost diff --git a/plugin/example/plugin.mk b/plugin/example/plugin.mk new file mode 100644 index 000000000..8ebd26d61 --- /dev/null +++ b/plugin/example/plugin.mk @@ -0,0 +1,4 @@ +# Add the object files you like to include in this plugin. +PLUGIN_OBJS += build_plugin/example/custom_obj.o +# Add additional dependent libraries this plugin might have +PLUGIN_LDFLAGS += \ No newline at end of file diff --git a/plugin/lz4/plugin.mk b/plugin/lz4/plugin.mk new file mode 100644 index 000000000..7a69027c7 --- /dev/null +++ b/plugin/lz4/plugin.mk @@ -0,0 +1,2 @@ +PLUGIN_OBJS += build_plugin/lz4/sparse_page_lz4_format.o +PLUGIN_LDFLAGS += -llz4 diff --git a/plugin/lz4/sparse_page_lz4_format.cc b/plugin/lz4/sparse_page_lz4_format.cc new file mode 100644 index 000000000..11d80defe --- /dev/null +++ b/plugin/lz4/sparse_page_lz4_format.cc @@ -0,0 +1,281 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file sparse_page_lz4_format.cc + * XGBoost Plugin to enable LZ4 compressed format on the external memory pages. + */ +#include +#include +#include +#include +#include +#include "../../src/data/sparse_batch_page.h" + +namespace xgboost { +namespace data { + +DMLC_REGISTRY_FILE_TAG(sparse_page_lz4_format); + +// array to help compression of decompression. +template +class CompressArray { + public: + // the data content. 
+ std::vector data; + // Decompression helper + // number of chunks + inline int num_chunk() const { + CHECK_GT(raw_chunks_.size(), 1); + return static_cast(raw_chunks_.size() - 1); + } + // raw bytes + inline size_t RawBytes() const { + return raw_chunks_.back() * sizeof(DType); + } + // encoded bytes + inline size_t EncodedBytes() const { + return encoded_chunks_.back() + + (encoded_chunks_.size() + raw_chunks_.size()) * sizeof(bst_uint); + } + // load the array from file. + inline void Read(dmlc::SeekStream* fi); + // run decode on chunk_id + inline void Decompress(int chunk_id); + // Compression helper + // initialize the compression chunks + inline void InitCompressChunks(const std::vector& chunk_ptr); + // initialize the compression chunks + inline void InitCompressChunks(size_t chunk_size, size_t max_nchunk); + // run decode on chunk_id + inline void Compress(int chunk_id); + // save the output buffer into file. + inline void Write(dmlc::Stream* fo); + + private: + // the chunk split of the data, by number of elements + std::vector raw_chunks_; + // the encoded chunk, by number of bytes + std::vector encoded_chunks_; + // output buffer of compression. + std::vector out_buffer_; + // input buffer of data. 
+ std::string in_buffer_; +}; + +template +inline void CompressArray::Read(dmlc::SeekStream* fi) { + CHECK(fi->Read(&raw_chunks_)); + CHECK(fi->Read(&encoded_chunks_)); + size_t buffer_size = encoded_chunks_.back(); + in_buffer_.resize(buffer_size); + CHECK_EQ(fi->Read(dmlc::BeginPtr(in_buffer_), buffer_size), buffer_size); + data.resize(raw_chunks_.back()); +} + +template +inline void CompressArray::Decompress(int chunk_id) { + int chunk_size = static_cast( + raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType); + int encoded_size = static_cast( + encoded_chunks_[chunk_id + 1] - encoded_chunks_[chunk_id]); + // decompress data + int src_size = LZ4_decompress_fast( + dmlc::BeginPtr(in_buffer_) + encoded_chunks_[chunk_id], + reinterpret_cast(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]), + chunk_size); + CHECK_EQ(encoded_size, src_size); +} + +template +inline void CompressArray::InitCompressChunks( + const std::vector& chunk_ptr) { + raw_chunks_ = chunk_ptr; + CHECK_GE(raw_chunks_.size(), 2); + out_buffer_.resize(raw_chunks_.size() - 1); + for (size_t i = 0; i < out_buffer_.size(); ++i) { + out_buffer_[i].resize(raw_chunks_[i + 1] - raw_chunks_[i]); + } +} + +template +inline void CompressArray::InitCompressChunks(size_t chunk_size, size_t max_nchunk) { + raw_chunks_.clear(); + raw_chunks_.push_back(0); + size_t min_chunk_size = data.size() / max_nchunk; + chunk_size = std::max(min_chunk_size, chunk_size); + size_t nstep = data.size() / chunk_size; + for (size_t i = 0; i < nstep; ++i) { + raw_chunks_.push_back(raw_chunks_.back() + chunk_size * i); + } + if (nstep == 0) raw_chunks_.push_back(0); + raw_chunks_.back() = data.size(); + CHECK_GE(raw_chunks_.size(), 2); + out_buffer_.resize(raw_chunks_.size() - 1); + for (size_t i = 0; i < out_buffer_.size(); ++i) { + out_buffer_[i].resize(raw_chunks_[i + 1] - raw_chunks_[i]); + } +} + +template +inline void CompressArray::Compress(int chunk_id) { + CHECK_LT(static_cast(chunk_id + 1), raw_chunks_.size()); + 
std::string& buf = out_buffer_[chunk_id]; + size_t raw_chunk_size = (raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType); + int bound = LZ4_compressBound(raw_chunk_size); + CHECK_NE(bound, 0); + buf.resize(bound); + int encoded_size = LZ4_compress_HC( + reinterpret_cast(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]), + dmlc::BeginPtr(buf), raw_chunk_size, buf.length(), 9); + CHECK_NE(encoded_size, 0); + CHECK_LE(static_cast(encoded_size), buf.length()); + buf.resize(encoded_size); +} + +template +inline void CompressArray::Write(dmlc::Stream* fo) { + encoded_chunks_.clear(); + encoded_chunks_.push_back(0); + for (size_t i = 0; i < out_buffer_.size(); ++i) { + encoded_chunks_.push_back(encoded_chunks_.back() + out_buffer_[i].length()); + } + fo->Write(raw_chunks_); + fo->Write(encoded_chunks_); + for (const std::string& buf : out_buffer_) { + fo->Write(dmlc::BeginPtr(buf), buf.length()); + } +} + +class SparsePageLZ4Format : public SparsePage::Format { + public: + SparsePageLZ4Format() + : raw_bytes_(0), encoded_bytes_(0) { + nthread_ = 4; + raw_bytes_ = encoded_bytes_ = 0; + } + ~SparsePageLZ4Format() { + if (raw_bytes_ != 0) { + LOG(CONSOLE) << "raw_bytes=" << raw_bytes_ + << ", encoded_bytes=" << encoded_bytes_ + << ", ratio=" << double(encoded_bytes_) / raw_bytes_; + } + } + + bool Read(SparsePage* page, dmlc::SeekStream* fi) override { + if (!fi->Read(&(page->offset))) return false; + CHECK_NE(page->offset.size(), 0) << "Invalid SparsePage file"; + this->LoadIndexValue(fi); + + page->data.resize(page->offset.back()); + CHECK_EQ(index_.data.size(), value_.data.size()); + CHECK_EQ(index_.data.size(), page->data.size()); + for (size_t i = 0; i < page->data.size(); ++i) { + page->data[i] = SparseBatch::Entry(index_.data[i], value_.data[i]); + } + return true; + } + + bool Read(SparsePage* page, + dmlc::SeekStream* fi, + const std::vector& sorted_index_set) override { + if (!fi->Read(&disk_offset_)) return false; + this->LoadIndexValue(fi); + + 
page->offset.clear(); + page->offset.push_back(0); + for (bst_uint cid : sorted_index_set) { + page->offset.push_back( + page->offset.back() + disk_offset_[cid + 1] - disk_offset_[cid]); + } + page->data.resize(page->offset.back()); + CHECK_EQ(index_.data.size(), value_.data.size()); + CHECK_EQ(index_.data.size(), disk_offset_.back()); + + for (size_t i = 0; i < sorted_index_set.size(); ++i) { + bst_uint cid = sorted_index_set[i]; + size_t dst_begin = page->offset[i]; + size_t src_begin = disk_offset_[cid]; + size_t num = disk_offset_[cid + 1] - disk_offset_[cid]; + for (size_t j = 0; j < num; ++j) { + page->data[dst_begin + j] = SparseBatch::Entry( + index_.data[src_begin + j], value_.data[src_begin + j]); + } + } + return true; + } + + void Write(const SparsePage& page, dmlc::Stream* fo) override { + CHECK(page.offset.size() != 0 && page.offset[0] == 0); + CHECK_EQ(page.offset.back(), page.data.size()); + fo->Write(page.offset); + index_.data.resize(page.data.size()); + value_.data.resize(page.data.size()); + + for (size_t i = 0; i < page.data.size(); ++i) { + index_.data[i] = page.data[i].index; + value_.data[i] = page.data[i].fvalue; + } + + index_.InitCompressChunks(kChunkSize, kMaxChunk); + value_.InitCompressChunks(kChunkSize, kMaxChunk); + + int nindex = index_.num_chunk(); + int nvalue = value_.num_chunk(); + int ntotal = nindex + nvalue; + #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_) + for (int i = 0; i < ntotal; ++i) { + if (i < nindex) { + index_.Compress(i); + } else { + value_.Compress(i - nindex); + } + } + index_.Write(fo); + value_.Write(fo); + raw_bytes_ += index_.RawBytes() + value_.RawBytes() + page.offset.size() * sizeof(size_t); + encoded_bytes_ += index_.EncodedBytes() + + value_.EncodedBytes() + page.offset.size() * sizeof(size_t); + } + + inline void LoadIndexValue(dmlc::SeekStream* fi) { + index_.Read(fi); + value_.Read(fi); + + int nindex = index_.num_chunk(); + int nvalue = value_.num_chunk(); + int ntotal = nindex 
+ nvalue; + #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_) + for (int i = 0; i < ntotal; ++i) { + if (i < nindex) { + index_.Decompress(i); + } else { + value_.Decompress(i - nindex); + } + } + } + + private: + // default chunk size. + static const size_t kChunkSize = 64 << 10UL; + // maximum chunk size. + static const size_t kMaxChunk = 64; + // number of threads + int nthread_; + // raw bytes + size_t raw_bytes_; + // encoded bytes + size_t encoded_bytes_; + /*! \brief external memory column offset */ + std::vector disk_offset_; + // internal index + CompressArray index_; + // value set. + CompressArray value_; +}; + +XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(lz4) +.describe("Apply LZ4 binary data compression for ext memory.") +.set_body([]() { + return new SparsePageLZ4Format(); + }); +} // namespace data +} // namespace xgboost diff --git a/src/data/data.cc b/src/data/data.cc index 30da58e8e..d3c530c32 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -255,5 +255,8 @@ std::string SparsePage::Format::DecideFormat(const std::string& cache_prefix) { return "raw"; } } + +// List of files that will be force linked in static links. +DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format); } // namespace data } // namespace xgboost diff --git a/src/data/sparse_batch_page.h b/src/data/sparse_batch_page.h index 78534bfd6..680b255c2 100644 --- a/src/data/sparse_batch_page.h +++ b/src/data/sparse_batch_page.h @@ -155,7 +155,7 @@ class SparsePage::Format { * \brief save the data to fo, when a page was written. * \param fo output stream */ - virtual void Write(const SparsePage& page, dmlc::Stream* fo) const = 0; + virtual void Write(const SparsePage& page, dmlc::Stream* fo) = 0; /*! * \brief Create sparse page of format. * \return The created format functors. 
diff --git a/src/data/sparse_page_dmatrix.h b/src/data/sparse_page_dmatrix.h index 9d0a2e344..e4aebee9c 100644 --- a/src/data/sparse_page_dmatrix.h +++ b/src/data/sparse_page_dmatrix.h @@ -71,7 +71,7 @@ class SparsePageDMatrix : public DMatrix { /*! \brief page size 256 MB */ static const size_t kPageSize = 256UL << 20UL; /*! \brief Maximum number of rows per batch. */ - static const size_t kMaxRowPerBatch = 32UL << 10UL; + static const size_t kMaxRowPerBatch = 64UL << 10UL; private: // declare the column batch iter. diff --git a/src/data/sparse_page_raw_format.cc b/src/data/sparse_page_raw_format.cc index 867ffad1c..d0019fde6 100644 --- a/src/data/sparse_page_raw_format.cc +++ b/src/data/sparse_page_raw_format.cc @@ -4,11 +4,14 @@ * Raw binary format of sparse page. */ #include +#include #include "./sparse_batch_page.h" namespace xgboost { namespace data { +DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format); + class SparsePageRawFormat : public SparsePage::Format { public: bool Read(SparsePage* page, dmlc::SeekStream* fi) override { @@ -73,7 +76,7 @@ class SparsePageRawFormat : public SparsePage::Format { return true; } - void Write(const SparsePage& page, dmlc::Stream* fo) const override { + void Write(const SparsePage& page, dmlc::Stream* fo) override { CHECK(page.offset.size() != 0 && page.offset[0] == 0); CHECK_EQ(page.offset.back(), page.data.size()); fo->Write(page.offset); diff --git a/src/objective/regression_obj.cc b/src/objective/regression_obj.cc index bf2a08e1a..6eb0f0a78 100644 --- a/src/objective/regression_obj.cc +++ b/src/objective/regression_obj.cc @@ -108,10 +108,10 @@ class RegLossObj : public ObjFunction { LOG(FATAL) << Loss::LabelErrorMsg(); } } - virtual const char* DefaultEvalMetric() const { + const char* DefaultEvalMetric() const override { return Loss::DefaultEvalMetric(); } - virtual void PredTransform(std::vector *io_preds) { + void PredTransform(std::vector *io_preds) override { std::vector &preds = *io_preds; const bst_omp_uint ndata = 
static_cast(preds.size()); #pragma omp parallel for schedule(static) @@ -119,7 +119,7 @@ class RegLossObj : public ObjFunction { preds[j] = Loss::PredTransform(preds[j]); } } - virtual float ProbToMargin(float base_score) const { + float ProbToMargin(float base_score) const override { return Loss::ProbToMargin(base_score); }