From b27b51f60ebdcc564978e904cfb067b8f1a0cfa5 Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 25 Jan 2016 11:56:16 -0800 Subject: [PATCH] [PLUGIN] Add densify parser --- dmlc-core | 2 +- make/travis.mk | 1 + plugin/dense_parser/dense_libsvm.cc | 85 +++++++++++++++++++++++++++++ plugin/dense_parser/plugin.mk | 2 + src/data/data.cc | 2 +- 5 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 plugin/dense_parser/dense_libsvm.cc create mode 100644 plugin/dense_parser/plugin.mk diff --git a/dmlc-core b/dmlc-core index c66d2ab2d..e0a18eb45 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit c66d2ab2d30f55303b65b5ed9dc1f9ee04260f7e +Subproject commit e0a18eb45cb9c6e7314dbd3328dda158e3a3486f diff --git a/make/travis.mk b/make/travis.mk index 82a9696bd..85f53ca32 100644 --- a/make/travis.mk +++ b/make/travis.mk @@ -31,3 +31,4 @@ LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server # XGB_PLUGINS += plugin/example/plugin.mk XGB_PLUGINS += plugin/lz4/plugin.mk +XGB_PLUGINS += plugin/dense_parser/plugin.mk diff --git a/plugin/dense_parser/dense_libsvm.cc b/plugin/dense_parser/dense_libsvm.cc new file mode 100644 index 000000000..45153c64b --- /dev/null +++ b/plugin/dense_parser/dense_libsvm.cc @@ -0,0 +1,85 @@ +/*! + * Copyright 2015 by Contributors + * \file dense_libsvm.cc + * \brief Plugin to load in libsvm, but fill all the missing entries with zeros. + * This plugin is mainly used for benchmark purposes and does not need to be included. 
+ */
+#include
+#include
+
+namespace dmlc {
+namespace data {
+
+/*!
+ * \brief Parser adaptor that re-emits every batch of a wrapped parser as dense rows.
+ *  Each output row has exactly num_col entries whose indices are 0..num_col-1;
+ *  entries that the input row does not mention carry the value 0.0f.
+ *  The wrapped parser is held in a std::unique_ptr, so this object owns it.
+ *  NOTE(review): template parameter lists and #include targets are missing from
+ *  this copy of the patch (bare "template", "dmlc::Parser*", "std::vector offset_");
+ *  they look stripped in transit and are left exactly as found - restore them
+ *  from the original commit before building.
+ */
+template
+class DensifyParser : public dmlc::Parser {
+ public:
+  /*!
+   * \brief wrap an existing parser
+   * \param parser parser to pull batches from; owned by this object afterwards
+   * \param num_col number of entries emitted for every row
+   */
+  DensifyParser(dmlc::Parser* parser, uint32_t num_col)
+      : parser_(parser), num_col_(num_col) {
+  }
+
+  /*! \brief forward BeforeFirst() to the wrapped parser */
+  void BeforeFirst() override {
+    parser_->BeforeFirst();
+  }
+
+  /*!
+   * \brief pull the next batch from the wrapped parser and densify it
+   * \return false when the wrapped parser has no further batch, true otherwise
+   */
+  bool Next() override {
+    if (!parser_->Next()) return false;
+    const RowBlock& batch = parser_->Value();
+    // one slot per (row, column) pair; values start at zero so that only the
+    // entries present in the input need to be written below
+    dense_index_.resize(num_col_ * batch.size);
+    dense_value_.resize(num_col_ * batch.size);
+    std::fill(dense_value_.begin(), dense_value_.end(), 0.0f);
+    offset_.resize(batch.size + 1);
+    offset_[0] = 0;
+
+    for (size_t i = 0; i < batch.size; ++i) {
+      // every dense row has the same length, so row i ends at (i + 1) * num_col_
+      offset_[i + 1] = (i + 1) * num_col_;
+      Row row = batch[i];
+      for (uint32_t j = 0; j < num_col_; ++j) {
+        dense_index_[i * num_col_ + j] = j;
+      }
+      for (unsigned k = 0; k < row.length; ++k) {
+        uint32_t index = row.get_index(k);
+        CHECK_LT(index, num_col_)
+            << "Feature index larger than num_col";
+        dense_value_[i * num_col_ + index] = row.get_value(k);
+      }
+    }
+    // start from a copy of the source block so every field other than
+    // index/value/offset keeps the source value, then point those three at
+    // the dense buffers held by this object
+    out_ = batch;
+    out_.index = dmlc::BeginPtr(dense_index_);
+    out_.value = dmlc::BeginPtr(dense_value_);
+    out_.offset = dmlc::BeginPtr(offset_);
+    return true;
+  }
+
+  /*!
+   * \brief block produced by the last successful Next()
+   *  Its index/value/offset pointers refer to member buffers that the next
+   *  call to Next() resizes and overwrites.
+   */
+  const dmlc::RowBlock& Value() const override {
+    return out_;
+  }
+
+  /*! \brief byte count reported by the wrapped parser */
+  size_t BytesRead() const override {
+    return parser_->BytesRead();
+  }
+
+ private:
+  /*! \brief block handed out by Value() */
+  RowBlock out_;
+  /*! \brief wrapped parser that supplies the input batches */
+  std::unique_ptr > parser_;
+  /*! \brief number of entries in every dense row */
+  uint32_t num_col_;
+  /*! \brief row boundaries of the dense block, batch.size + 1 entries */
+  std::vector offset_;
+  /*! \brief column index of every dense entry */
+  std::vector dense_index_;
+  /*! \brief value of every dense entry */
+  std::vector dense_value_;
+};
+
+/*!
+ * \brief create a "libsvm" parser for path and wrap it in a DensifyParser
+ * \param path location handed to Parser::Create
+ * \param args parser arguments; must contain "num_col" holding a positive integer
+ * \param part_index partition index handed to Parser::Create
+ * \param num_parts partition count handed to Parser::Create
+ * \return newly allocated DensifyParser
+ */
+template
+Parser *
+CreateDenseLibSVMParser(const std::string& path,
+                        const std::map& args,
+                        unsigned part_index,
+                        unsigned num_parts) {
+  CHECK_NE(args.count("num_col"), 0) << "expect num_col in dense_libsvm";
+  // atoi yields 0 for non-numeric text, and casting a negative result to
+  // uint32_t would wrap into a huge column count; reject both up front,
+  // before the inner parser is created.
+  const int num_col = atoi(args.at("num_col").c_str());
+  CHECK_GT(num_col, 0) << "num_col in dense_libsvm must be a positive integer";
+  return new DensifyParser(
+      Parser::Create(path.c_str(), part_index, num_parts, "libsvm"),
+      static_cast<uint32_t>(num_col));
+}
+}  // namespace data
+
+DMLC_REGISTER_DATA_PARSER(uint32_t, dense_libsvm, data::CreateDenseLibSVMParser);
+} // 
namespace dmlc diff --git a/plugin/dense_parser/plugin.mk b/plugin/dense_parser/plugin.mk new file mode 100644 index 000000000..027cc42f8 --- /dev/null +++ b/plugin/dense_parser/plugin.mk @@ -0,0 +1,2 @@ +PLUGIN_OBJS += build_plugin/dense_parser/dense_libsvm.o +PLUGIN_LDFLAGS += diff --git a/src/data/data.cc b/src/data/data.cc index 9c63f8aa2..02b972d83 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -181,7 +181,7 @@ DMatrix* DMatrix::Load(const std::string& uri, std::string ftype = file_format; if (file_format == "auto") ftype = "libsvm"; std::unique_ptr > parser( - dmlc::Parser::Create(fname.c_str(), partid, npart, ftype.c_str())); + dmlc::Parser::Create(fname.c_str(), partid, npart, file_format.c_str())); DMatrix* dmat = DMatrix::Create(parser.get(), cache_file); if (!silent) { LOG(CONSOLE) << dmat->info().num_row << 'x' << dmat->info().num_col << " matrix with "