* Fix various typos * Add override to functions that are overridden gcc gives warnings about functions that are being overridden but are not marked as overridden. This fixes it. * Use bst_float consistently Use bst_float for all the variables that involve weight, leaf value, gradient, hessian, gain, loss_chg, predictions, base_margin, feature values. In some cases, when due to additions and so on the value can take a larger value, double is used. This ensures that type conversions are minimal and reduces loss of precision.
87 lines
2.6 KiB
C++
87 lines
2.6 KiB
C++
/*!
|
|
* Copyright 2015 by Contributors
|
|
* \file dense_libsvm.cc
|
|
* \brief Plugin to load in libsvm, but fill all the missing entries with zeros.
|
|
* This plugin is mainly used for benchmark purposes and does not need to be included.
|
|
*/
|
|
#include <xgboost/base.h>
#include <dmlc/data.h>
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <vector>
|
|
|
|
namespace dmlc {
|
|
namespace data {
|
|
|
|
template<typename IndexType>
|
|
class DensifyParser : public dmlc::Parser<IndexType> {
|
|
public:
|
|
DensifyParser(dmlc::Parser<IndexType>* parser, uint32_t num_col)
|
|
: parser_(parser), num_col_(num_col) {
|
|
}
|
|
|
|
void BeforeFirst() override {
|
|
parser_->BeforeFirst();
|
|
}
|
|
|
|
bool Next() override {
|
|
if (!parser_->Next()) return false;
|
|
const RowBlock<IndexType>& batch = parser_->Value();
|
|
LOG(INFO) << batch.size;
|
|
dense_index_.resize(num_col_ * batch.size);
|
|
dense_value_.resize(num_col_ * batch.size);
|
|
std::fill(dense_value_.begin(), dense_value_.end(), 0.0);
|
|
offset_.resize(batch.size + 1);
|
|
offset_[0] = 0;
|
|
|
|
for (size_t i = 0; i < batch.size; ++i) {
|
|
offset_[i + 1] = (i + 1) * num_col_;
|
|
Row<IndexType> row = batch[i];
|
|
for (uint32_t j = 0; j < num_col_; ++j) {
|
|
dense_index_[i * num_col_ + j] = j;
|
|
}
|
|
for (unsigned k = 0; k < row.length; ++k) {
|
|
uint32_t index = row.get_index(k);
|
|
CHECK_LT(index, num_col_)
|
|
<< "Featuere index larger than num_col";
|
|
dense_value_[i * num_col_ + index] = row.get_value(k);
|
|
}
|
|
}
|
|
out_ = batch;
|
|
out_.index = dmlc::BeginPtr(dense_index_);
|
|
out_.value = dmlc::BeginPtr(dense_value_);
|
|
out_.offset = dmlc::BeginPtr(offset_);
|
|
return true;
|
|
}
|
|
|
|
const dmlc::RowBlock<IndexType>& Value() const override {
|
|
return out_;
|
|
}
|
|
|
|
size_t BytesRead() const override {
|
|
return parser_->BytesRead();
|
|
}
|
|
|
|
private:
|
|
RowBlock<IndexType> out_;
|
|
std::unique_ptr<Parser<IndexType> > parser_;
|
|
uint32_t num_col_;
|
|
std::vector<size_t> offset_;
|
|
std::vector<IndexType> dense_index_;
|
|
std::vector<xgboost::bst_float> dense_value_;
|
|
};
|
|
|
|
template<typename IndexType>
|
|
Parser<IndexType> *
|
|
CreateDenseLibSVMParser(const std::string& path,
|
|
const std::map<std::string, std::string>& args,
|
|
unsigned part_index,
|
|
unsigned num_parts) {
|
|
CHECK_NE(args.count("num_col"), 0) << "expect num_col in dense_libsvm";
|
|
return new DensifyParser<IndexType>(
|
|
Parser<IndexType>::Create(path.c_str(), part_index, num_parts, "libsvm"),
|
|
uint32_t(atoi(args.at("num_col").c_str())));
|
|
}
|
|
} // namespace data
|
|
|
|
// Register the uint32_t instantiation of CreateDenseLibSVMParser with dmlc
// under the name dense_libsvm.
// NOTE(review): presumably this makes the parser selectable as the
// "dense_libsvm" data format -- confirm against the macro definition.
DMLC_REGISTER_DATA_PARSER(uint32_t, dense_libsvm, data::CreateDenseLibSVMParser<uint32_t>);
|
|
} // namespace dmlc
|