[PLUGIN] Add plugin system
This commit is contained in:
parent
36c389ac46
commit
96f4542a67
1
.gitignore
vendored
1
.gitignore
vendored
@ -72,3 +72,4 @@ build
|
||||
config.mk
|
||||
xgboost
|
||||
*.data
|
||||
build_plugin
|
||||
|
||||
25
Makefile
25
Makefile
@ -1,3 +1,8 @@
|
||||
# flags by plugin
|
||||
PLUGIN_OBJS=
|
||||
PLUGIN_LDFLAGS=
|
||||
PLUGIN_CFLAGS=
|
||||
|
||||
ifndef config
|
||||
ifneq ("$(wildcard ./config.mk)","")
|
||||
config = config.mk
|
||||
@ -36,8 +41,8 @@ ifeq ($(OS), Windows_NT)
|
||||
export CC = gcc -m64
|
||||
endif
|
||||
|
||||
export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS)
|
||||
export CFLAGS= -std=c++0x -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops -fPIC -Iinclude $(ADD_CFLAGS)
|
||||
export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS) $(PLUGIN_LDFLAGS)
|
||||
export CFLAGS= -std=c++0x -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops -fPIC -Iinclude $(ADD_CFLAGS) $(PLUGIN_CFLAGS)
|
||||
CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include
|
||||
#java include path
|
||||
export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java
|
||||
@ -76,7 +81,7 @@ $(RABIT)/lib/$(LIB_RABIT):
|
||||
java: java/libxgboost4j.so
|
||||
|
||||
SRC = $(wildcard src/*.cc src/*/*.cc)
|
||||
ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC))
|
||||
ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC)) $(PLUGIN_OBJS)
|
||||
AMALGA_OBJ = amalgamation/xgboost-all0.o
|
||||
LIB_DEP = $(DMLC_CORE)/libdmlc.a $(RABIT)/lib/$(LIB_RABIT)
|
||||
ALL_DEP = $(filter-out build/cli_main.o, $(ALL_OBJ)) $(LIB_DEP)
|
||||
@ -87,6 +92,11 @@ build/%.o: src/%.cc
|
||||
$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
|
||||
$(CXX) -c $(CFLAGS) -c $< -o $@
|
||||
|
||||
# Compile one plugin source file from plugin/ into the matching path under build_plugin/.
# The first command writes a make dependency file (.d) next to the object,
# the second command produces the object file itself.
build_plugin/%.o: plugin/%.cc
	@mkdir -p $(@D)
	$(CXX) $(CFLAGS) -MM -MT $@ $< >$(@:.o=.d)
	$(CXX) -c $(CFLAGS) -c $< -o $@
|
||||
|
||||
# This should be equivalent to $(ALL_OBJ) except for build/cli_main.o
|
||||
amalgamation/xgboost-all0.o: amalgamation/xgboost-all0.cc
|
||||
$(CXX) -c $(CFLAGS) -c $< -o $@
|
||||
@ -104,20 +114,20 @@ lib/libxgboost.so: $(ALL_DEP)
|
||||
@mkdir -p $(@D)
|
||||
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
|
||||
|
||||
java/libxgboost4j.so: java/xgboost4j_wrapper.cpp lib/libxgboost.a $(LIB_DEP)
|
||||
java/libxgboost4j.so: java/xgboost4j_wrapper.cpp $(ALL_DEP)
|
||||
$(CXX) $(CFLAGS) $(JAVAINCFLAGS) -shared -o $@ $(filter %.cpp %.o %.a, $^) $(LDFLAGS)
|
||||
|
||||
xgboost: $(CLI_OBJ) lib/libxgboost.a $(LIB_DEP)
|
||||
xgboost: $(CLI_OBJ) $(ALL_DEP)
|
||||
$(CXX) $(CFLAGS) -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
|
||||
|
||||
rcpplint:
|
||||
python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} R-package/src
|
||||
|
||||
lint: rcpplint
|
||||
python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} include src
|
||||
python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} include src plugin
|
||||
|
||||
clean:
|
||||
$(RM) -rf build lib bin *~ */*~ */*/*~ */*/*/*~ amalgamation/*.o xgboost
|
||||
$(RM) -rf build build_plugin lib bin *~ */*~ */*/*~ */*/*/*~ amalgamation/*.o xgboost
|
||||
|
||||
clean_all: clean
|
||||
cd $(DMLC_CORE); make clean; cd -
|
||||
@ -157,3 +167,4 @@ Rcheck:
|
||||
|
||||
-include build/*.d
|
||||
-include build/*/*.d
|
||||
-include build_plugin/*/*.d
|
||||
|
||||
@ -46,3 +46,9 @@ LIB_RABIT = librabit.a
|
||||
|
||||
# path to libjvm.so
|
||||
LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
|
||||
|
||||
# List of additional plugins, checkout plugin folder.
|
||||
# uncomment the following lines to include these plugins
|
||||
# you can also add your own plugin like this
|
||||
|
||||
# include plugin/example/plugin.mk
|
||||
|
||||
32
plugin/README.md
Normal file
32
plugin/README.md
Normal file
@ -0,0 +1,32 @@
|
||||
XGBoost Plugins Modules
|
||||
=======================
|
||||
This folder contains plugin modules to xgboost that can be optionally installed.
|
||||
The plugin system helps us to extend xgboost with additional features,
|
||||
and add experimental features that may not yet be ready to be included in the main project.
|
||||
|
||||
To include a certain plugin, say ```plugin_a```, you only need to add the following line to the config.mk.
|
||||
|
||||
```makefile
|
||||
# Add the plugin by including its plugin.mk in the config
|
||||
include plugin/plugin_a/plugin.mk
|
||||
```
|
||||
|
||||
Then rebuild libxgboost by typing ```make```; the resulting library will have the plugin enabled.
|
||||
|
||||
Link Static XGBoost Library with Plugins
|
||||
----------------------------------------
|
||||
This problem only happens when you link ```libxgboost.a```.
|
||||
If you only use ```libxgboost.so``` (this includes the Python and other bindings),
|
||||
you can ignore this section.
|
||||
|
||||
When you want to link ```libxgboost.a``` with additional plugins included,
|
||||
you will need to enable whole-archive linking via the following option.
|
||||
```bash
|
||||
--whole-archive libxgboost.a --no-whole-archive
|
||||
```
|
||||
|
||||
Write Your Own Plugin
|
||||
---------------------
|
||||
You can plug your own modules into xgboost by adding code to this folder,
|
||||
without modification to the main code repo.
|
||||
The [example](example) folder provides an example to write a plugin.
|
||||
21
plugin/example/README.md
Normal file
21
plugin/example/README.md
Normal file
@ -0,0 +1,21 @@
|
||||
XGBoost Plugin Example
|
||||
======================
|
||||
This folder provides an example of xgboost plugin.
|
||||
|
||||
There are three steps you need to do to add a plugin to xgboost:
|
||||
- Create your source .cc file, implement a new extension
|
||||
- In this example [custom_obj.cc](custom_obj.cc)
|
||||
- Register this extension to xgboost via the registration macro
|
||||
- In this example ```XGBOOST_REGISTER_OBJECTIVE``` in [this line](custom_obj.cc#L75)
|
||||
- Create a [plugin.mk](plugin.mk) on this folder
|
||||
|
||||
To add this plugin, add the following line to ```config.mk```(template in make/config.mk).
|
||||
```makefile
|
||||
# Add the plugin by including its plugin.mk in the config
|
||||
include plugin/example/plugin.mk
|
||||
```
|
||||
|
||||
Then you can test this plugin by using ```objective=mylogistic``` parameter.
|
||||
|
||||
|
||||
|
||||
80
plugin/example/custom_obj.cc
Normal file
80
plugin/example/custom_obj.cc
Normal file
@ -0,0 +1,80 @@
|
||||
/*!
|
||||
* Copyright 2015 by Contributors
|
||||
 * \file custom_obj.cc
|
||||
* \brief This is an example to define plugin of xgboost.
|
||||
 *  This plugin defines an additional objective function.
|
||||
*/
|
||||
#include <xgboost/base.h>
|
||||
#include <dmlc/parameter.h>
|
||||
#include <xgboost/objective.h>
|
||||
|
||||
namespace xgboost {
|
||||
namespace obj {
|
||||
|
||||
// This is a helpful data structure to define parameters
|
||||
// You do not have to use it.
|
||||
// see http://dmlc-core.readthedocs.org/en/latest/parameter.html
|
||||
// for introduction of this module.
|
||||
struct MyLogisticParam : public dmlc::Parameter<MyLogisticParam> {
|
||||
float scale_neg_weight;
|
||||
// declare parameters
|
||||
DMLC_DECLARE_PARAMETER(MyLogisticParam) {
|
||||
DMLC_DECLARE_FIELD(scale_neg_weight).set_default(1.0f).set_lower_bound(0.0f)
|
||||
.describe("Scale the weight of negative examples by this factor");
|
||||
}
|
||||
};
|
||||
|
||||
DMLC_REGISTER_PARAMETER(MyLogisticParam);
|
||||
|
||||
// Define a customized logistic regression objective in C++.
|
||||
// Implement the interface.
|
||||
class MyLogistic : public ObjFunction {
|
||||
public:
|
||||
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
|
||||
param_.InitAllowUnknown(args);
|
||||
}
|
||||
void GetGradient(const std::vector<float> &preds,
|
||||
const MetaInfo &info,
|
||||
int iter,
|
||||
std::vector<bst_gpair> *out_gpair) override {
|
||||
out_gpair->resize(preds.size());
|
||||
for (size_t i = 0; i < preds.size(); ++i) {
|
||||
float w = info.GetWeight(i);
|
||||
// scale the negative examples!
|
||||
if (info.labels[i] == 0.0f) w *= param_.scale_neg_weight;
|
||||
// logistic transoformation
|
||||
float p = 1.0f / (1.0f + expf(-preds[i]));
|
||||
// this is the gradient
|
||||
float grad = (p - info.labels[i]) * w;
|
||||
// this is the second order gradient
|
||||
float hess = p * (1.0f - p) * w;
|
||||
out_gpair->at(i) = bst_gpair(grad, hess);
|
||||
}
|
||||
}
|
||||
const char* DefaultEvalMetric() const override {
|
||||
return "error";
|
||||
}
|
||||
void PredTransform(std::vector<float> *io_preds) override {
|
||||
// transform margin value to probability.
|
||||
std::vector<float> &preds = *io_preds;
|
||||
for (size_t i = 0; i < preds.size(); ++i) {
|
||||
preds[i] = 1.0f / (1.0f + expf(-preds[i]));
|
||||
}
|
||||
}
|
||||
float ProbToMargin(float base_score) const override {
|
||||
// transform probability to margin value
|
||||
return -std::log(1.0f / base_score - 1.0f);
|
||||
}
|
||||
|
||||
private:
|
||||
MyLogisticParam param_;
|
||||
};
|
||||
|
||||
// Finally register the objective function.
|
||||
// After it succeeds you can try use xgboost with objective=mylogistic
|
||||
XGBOOST_REGISTER_OBJECTIVE(MyLogistic, "mylogistic")
|
||||
.describe("User defined logistic regression plugin")
|
||||
.set_body([]() { return new MyLogistic(); });
|
||||
|
||||
} // namespace obj
|
||||
} // namespace xgboost
|
||||
4
plugin/example/plugin.mk
Normal file
4
plugin/example/plugin.mk
Normal file
@ -0,0 +1,4 @@
|
||||
# Add the object files you like to include in this plugin.
|
||||
PLUGIN_OBJS += build_plugin/example/custom_obj.o
|
||||
# Add additional dependent libraries this plugin might have
|
||||
PLUGIN_LDFLAGS +=
|
||||
2
plugin/lz4/plugin.mk
Normal file
2
plugin/lz4/plugin.mk
Normal file
@ -0,0 +1,2 @@
|
||||
PLUGIN_OBJS += build_plugin/lz4/sparse_page_lz4_format.o
|
||||
PLUGIN_LDFLAGS += -llz4
|
||||
281
plugin/lz4/sparse_page_lz4_format.cc
Normal file
281
plugin/lz4/sparse_page_lz4_format.cc
Normal file
@ -0,0 +1,281 @@
|
||||
/*!
|
||||
* Copyright (c) 2015 by Contributors
|
||||
* \file sparse_page_lz4_format.cc
|
||||
* XGBoost Plugin to enable LZ4 compressed format on the external memory pages.
|
||||
*/
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/logging.h>
|
||||
#include <dmlc/registry.h>
|
||||
#include <lz4.h>
|
||||
#include <lz4hc.h>
|
||||
#include "../../src/data/sparse_batch_page.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
DMLC_REGISTRY_FILE_TAG(sparse_page_lz4_format);
|
||||
|
||||
// array to help compression of decompression.
|
||||
template<typename DType>
|
||||
class CompressArray {
|
||||
public:
|
||||
// the data content.
|
||||
std::vector<DType> data;
|
||||
// Decompression helper
|
||||
// number of chunks
|
||||
inline int num_chunk() const {
|
||||
CHECK_GT(raw_chunks_.size(), 1);
|
||||
return static_cast<int>(raw_chunks_.size() - 1);
|
||||
}
|
||||
// raw bytes
|
||||
inline size_t RawBytes() const {
|
||||
return raw_chunks_.back() * sizeof(DType);
|
||||
}
|
||||
// encoded bytes
|
||||
inline size_t EncodedBytes() const {
|
||||
return encoded_chunks_.back() +
|
||||
(encoded_chunks_.size() + raw_chunks_.size()) * sizeof(bst_uint);
|
||||
}
|
||||
// load the array from file.
|
||||
inline void Read(dmlc::SeekStream* fi);
|
||||
// run decode on chunk_id
|
||||
inline void Decompress(int chunk_id);
|
||||
// Compression helper
|
||||
// initialize the compression chunks
|
||||
inline void InitCompressChunks(const std::vector<bst_uint>& chunk_ptr);
|
||||
// initialize the compression chunks
|
||||
inline void InitCompressChunks(size_t chunk_size, size_t max_nchunk);
|
||||
// run decode on chunk_id
|
||||
inline void Compress(int chunk_id);
|
||||
// save the output buffer into file.
|
||||
inline void Write(dmlc::Stream* fo);
|
||||
|
||||
private:
|
||||
// the chunk split of the data, by number of elements
|
||||
std::vector<bst_uint> raw_chunks_;
|
||||
// the encoded chunk, by number of bytes
|
||||
std::vector<bst_uint> encoded_chunks_;
|
||||
// output buffer of compression.
|
||||
std::vector<std::string> out_buffer_;
|
||||
// input buffer of data.
|
||||
std::string in_buffer_;
|
||||
};
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::Read(dmlc::SeekStream* fi) {
|
||||
CHECK(fi->Read(&raw_chunks_));
|
||||
CHECK(fi->Read(&encoded_chunks_));
|
||||
size_t buffer_size = encoded_chunks_.back();
|
||||
in_buffer_.resize(buffer_size);
|
||||
CHECK_EQ(fi->Read(dmlc::BeginPtr(in_buffer_), buffer_size), buffer_size);
|
||||
data.resize(raw_chunks_.back());
|
||||
}
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::Decompress(int chunk_id) {
|
||||
int chunk_size = static_cast<int>(
|
||||
raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType);
|
||||
int encoded_size = static_cast<int>(
|
||||
encoded_chunks_[chunk_id + 1] - encoded_chunks_[chunk_id]);
|
||||
// decompress data
|
||||
int src_size = LZ4_decompress_fast(
|
||||
dmlc::BeginPtr(in_buffer_) + encoded_chunks_[chunk_id],
|
||||
reinterpret_cast<char*>(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]),
|
||||
chunk_size);
|
||||
CHECK_EQ(encoded_size, src_size);
|
||||
}
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::InitCompressChunks(
|
||||
const std::vector<bst_uint>& chunk_ptr) {
|
||||
raw_chunks_ = chunk_ptr;
|
||||
CHECK_GE(raw_chunks_.size(), 2);
|
||||
out_buffer_.resize(raw_chunks_.size() - 1);
|
||||
for (size_t i = 0; i < out_buffer_.size(); ++i) {
|
||||
out_buffer_[i].resize(raw_chunks_[i + 1] - raw_chunks_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::InitCompressChunks(size_t chunk_size, size_t max_nchunk) {
|
||||
raw_chunks_.clear();
|
||||
raw_chunks_.push_back(0);
|
||||
size_t min_chunk_size = data.size() / max_nchunk;
|
||||
chunk_size = std::max(min_chunk_size, chunk_size);
|
||||
size_t nstep = data.size() / chunk_size;
|
||||
for (size_t i = 0; i < nstep; ++i) {
|
||||
raw_chunks_.push_back(raw_chunks_.back() + chunk_size * i);
|
||||
}
|
||||
if (nstep == 0) raw_chunks_.push_back(0);
|
||||
raw_chunks_.back() = data.size();
|
||||
CHECK_GE(raw_chunks_.size(), 2);
|
||||
out_buffer_.resize(raw_chunks_.size() - 1);
|
||||
for (size_t i = 0; i < out_buffer_.size(); ++i) {
|
||||
out_buffer_[i].resize(raw_chunks_[i + 1] - raw_chunks_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::Compress(int chunk_id) {
|
||||
CHECK_LT(static_cast<size_t>(chunk_id + 1), raw_chunks_.size());
|
||||
std::string& buf = out_buffer_[chunk_id];
|
||||
size_t raw_chunk_size = (raw_chunks_[chunk_id + 1] - raw_chunks_[chunk_id]) * sizeof(DType);
|
||||
int bound = LZ4_compressBound(raw_chunk_size);
|
||||
CHECK_NE(bound, 0);
|
||||
buf.resize(bound);
|
||||
int encoded_size = LZ4_compress_HC(
|
||||
reinterpret_cast<char*>(dmlc::BeginPtr(data) + raw_chunks_[chunk_id]),
|
||||
dmlc::BeginPtr(buf), raw_chunk_size, buf.length(), 9);
|
||||
CHECK_NE(encoded_size, 0);
|
||||
CHECK_LE(static_cast<size_t>(encoded_size), buf.length());
|
||||
buf.resize(encoded_size);
|
||||
}
|
||||
|
||||
template<typename DType>
|
||||
inline void CompressArray<DType>::Write(dmlc::Stream* fo) {
|
||||
encoded_chunks_.clear();
|
||||
encoded_chunks_.push_back(0);
|
||||
for (size_t i = 0; i < out_buffer_.size(); ++i) {
|
||||
encoded_chunks_.push_back(encoded_chunks_.back() + out_buffer_[i].length());
|
||||
}
|
||||
fo->Write(raw_chunks_);
|
||||
fo->Write(encoded_chunks_);
|
||||
for (const std::string& buf : out_buffer_) {
|
||||
fo->Write(dmlc::BeginPtr(buf), buf.length());
|
||||
}
|
||||
}
|
||||
|
||||
class SparsePageLZ4Format : public SparsePage::Format {
|
||||
public:
|
||||
SparsePageLZ4Format()
|
||||
: raw_bytes_(0), encoded_bytes_(0) {
|
||||
nthread_ = 4;
|
||||
raw_bytes_ = encoded_bytes_ = 0;
|
||||
}
|
||||
~SparsePageLZ4Format() {
|
||||
if (raw_bytes_ != 0) {
|
||||
LOG(CONSOLE) << "raw_bytes=" << raw_bytes_
|
||||
<< ", encoded_bytes=" << encoded_bytes_
|
||||
<< ", ratio=" << double(encoded_bytes_) / raw_bytes_;
|
||||
}
|
||||
}
|
||||
|
||||
bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
|
||||
if (!fi->Read(&(page->offset))) return false;
|
||||
CHECK_NE(page->offset.size(), 0) << "Invalid SparsePage file";
|
||||
this->LoadIndexValue(fi);
|
||||
|
||||
page->data.resize(page->offset.back());
|
||||
CHECK_EQ(index_.data.size(), value_.data.size());
|
||||
CHECK_EQ(index_.data.size(), page->data.size());
|
||||
for (size_t i = 0; i < page->data.size(); ++i) {
|
||||
page->data[i] = SparseBatch::Entry(index_.data[i], value_.data[i]);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Read(SparsePage* page,
|
||||
dmlc::SeekStream* fi,
|
||||
const std::vector<bst_uint>& sorted_index_set) override {
|
||||
if (!fi->Read(&disk_offset_)) return false;
|
||||
this->LoadIndexValue(fi);
|
||||
|
||||
page->offset.clear();
|
||||
page->offset.push_back(0);
|
||||
for (bst_uint cid : sorted_index_set) {
|
||||
page->offset.push_back(
|
||||
page->offset.back() + disk_offset_[cid + 1] - disk_offset_[cid]);
|
||||
}
|
||||
page->data.resize(page->offset.back());
|
||||
CHECK_EQ(index_.data.size(), value_.data.size());
|
||||
CHECK_EQ(index_.data.size(), disk_offset_.back());
|
||||
|
||||
for (size_t i = 0; i < sorted_index_set.size(); ++i) {
|
||||
bst_uint cid = sorted_index_set[i];
|
||||
size_t dst_begin = page->offset[i];
|
||||
size_t src_begin = disk_offset_[cid];
|
||||
size_t num = disk_offset_[cid + 1] - disk_offset_[cid];
|
||||
for (size_t j = 0; j < num; ++j) {
|
||||
page->data[dst_begin + j] = SparseBatch::Entry(
|
||||
index_.data[src_begin + j], value_.data[src_begin + j]);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void Write(const SparsePage& page, dmlc::Stream* fo) override {
|
||||
CHECK(page.offset.size() != 0 && page.offset[0] == 0);
|
||||
CHECK_EQ(page.offset.back(), page.data.size());
|
||||
fo->Write(page.offset);
|
||||
index_.data.resize(page.data.size());
|
||||
value_.data.resize(page.data.size());
|
||||
|
||||
for (size_t i = 0; i < page.data.size(); ++i) {
|
||||
index_.data[i] = page.data[i].index;
|
||||
value_.data[i] = page.data[i].fvalue;
|
||||
}
|
||||
|
||||
index_.InitCompressChunks(kChunkSize, kMaxChunk);
|
||||
value_.InitCompressChunks(kChunkSize, kMaxChunk);
|
||||
|
||||
int nindex = index_.num_chunk();
|
||||
int nvalue = value_.num_chunk();
|
||||
int ntotal = nindex + nvalue;
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_)
|
||||
for (int i = 0; i < ntotal; ++i) {
|
||||
if (i < nindex) {
|
||||
index_.Compress(i);
|
||||
} else {
|
||||
value_.Compress(i - nindex);
|
||||
}
|
||||
}
|
||||
index_.Write(fo);
|
||||
value_.Write(fo);
|
||||
raw_bytes_ += index_.RawBytes() + value_.RawBytes() + page.offset.size() * sizeof(size_t);
|
||||
encoded_bytes_ += index_.EncodedBytes() +
|
||||
value_.EncodedBytes() + page.offset.size() * sizeof(size_t);
|
||||
}
|
||||
|
||||
inline void LoadIndexValue(dmlc::SeekStream* fi) {
|
||||
index_.Read(fi);
|
||||
value_.Read(fi);
|
||||
|
||||
int nindex = index_.num_chunk();
|
||||
int nvalue = value_.num_chunk();
|
||||
int ntotal = nindex + nvalue;
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_)
|
||||
for (int i = 0; i < ntotal; ++i) {
|
||||
if (i < nindex) {
|
||||
index_.Decompress(i);
|
||||
} else {
|
||||
value_.Decompress(i - nindex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// default chunk size.
|
||||
static const size_t kChunkSize = 64 << 10UL;
|
||||
// maximum chunk size.
|
||||
static const size_t kMaxChunk = 64;
|
||||
// number of threads
|
||||
int nthread_;
|
||||
// raw bytes
|
||||
size_t raw_bytes_;
|
||||
// encoded bytes
|
||||
size_t encoded_bytes_;
|
||||
/*! \brief external memory column offset */
|
||||
std::vector<size_t> disk_offset_;
|
||||
// internal index
|
||||
CompressArray<bst_uint> index_;
|
||||
// value set.
|
||||
CompressArray<bst_float> value_;
|
||||
};
|
||||
|
||||
// Register the format under the name lz4.
XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(lz4)
    .describe("Apply LZ4 binary data compression for ext memory.")
    .set_body([]() -> SparsePage::Format* { return new SparsePageLZ4Format(); });
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
@ -255,5 +255,8 @@ std::string SparsePage::Format::DecideFormat(const std::string& cache_prefix) {
|
||||
return "raw";
|
||||
}
|
||||
}
|
||||
|
||||
// List of files that will be force linked in static links.
|
||||
DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@ -155,7 +155,7 @@ class SparsePage::Format {
|
||||
* \brief save the data to fo, when a page was written.
|
||||
* \param fo output stream
|
||||
*/
|
||||
virtual void Write(const SparsePage& page, dmlc::Stream* fo) const = 0;
|
||||
virtual void Write(const SparsePage& page, dmlc::Stream* fo) = 0;
|
||||
/*!
|
||||
* \brief Create sparse page of format.
|
||||
* \return The created format functors.
|
||||
|
||||
@ -71,7 +71,7 @@ class SparsePageDMatrix : public DMatrix {
|
||||
/*! \brief page size 256 MB */
|
||||
static const size_t kPageSize = 256UL << 20UL;
|
||||
/*! \brief Maximum number of rows per batch. */
|
||||
static const size_t kMaxRowPerBatch = 32UL << 10UL;
|
||||
static const size_t kMaxRowPerBatch = 64UL << 10UL;
|
||||
|
||||
private:
|
||||
// declare the column batch iter.
|
||||
|
||||
@ -4,11 +4,14 @@
|
||||
* Raw binary format of sparse page.
|
||||
*/
|
||||
#include <xgboost/data.h>
|
||||
#include <dmlc/registry.h>
|
||||
#include "./sparse_batch_page.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format);
|
||||
|
||||
class SparsePageRawFormat : public SparsePage::Format {
|
||||
public:
|
||||
bool Read(SparsePage* page, dmlc::SeekStream* fi) override {
|
||||
@ -73,7 +76,7 @@ class SparsePageRawFormat : public SparsePage::Format {
|
||||
return true;
|
||||
}
|
||||
|
||||
void Write(const SparsePage& page, dmlc::Stream* fo) const override {
|
||||
void Write(const SparsePage& page, dmlc::Stream* fo) override {
|
||||
CHECK(page.offset.size() != 0 && page.offset[0] == 0);
|
||||
CHECK_EQ(page.offset.back(), page.data.size());
|
||||
fo->Write(page.offset);
|
||||
|
||||
@ -108,10 +108,10 @@ class RegLossObj : public ObjFunction {
|
||||
LOG(FATAL) << Loss::LabelErrorMsg();
|
||||
}
|
||||
}
|
||||
virtual const char* DefaultEvalMetric() const {
|
||||
const char* DefaultEvalMetric() const override {
|
||||
return Loss::DefaultEvalMetric();
|
||||
}
|
||||
virtual void PredTransform(std::vector<float> *io_preds) {
|
||||
void PredTransform(std::vector<float> *io_preds) override {
|
||||
std::vector<float> &preds = *io_preds;
|
||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
@ -119,7 +119,7 @@ class RegLossObj : public ObjFunction {
|
||||
preds[j] = Loss::PredTransform(preds[j]);
|
||||
}
|
||||
}
|
||||
virtual float ProbToMargin(float base_score) const {
|
||||
float ProbToMargin(float base_score) const override {
|
||||
return Loss::ProbToMargin(base_score);
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user