Clang-tidy static analysis (#3222)
* Clang-tidy static analysis
* Modernise checks
* Google coding standard checks
* Identifier renaming according to Google style
This commit is contained in:
114
src/data/data.cc
114
src/data/data.cc
@@ -24,51 +24,51 @@ DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg);
|
||||
namespace xgboost {
|
||||
// implementation of inline functions
|
||||
void MetaInfo::Clear() {
|
||||
num_row = num_col = num_nonzero = 0;
|
||||
labels.clear();
|
||||
root_index.clear();
|
||||
group_ptr.clear();
|
||||
weights.clear();
|
||||
base_margin.clear();
|
||||
num_row_ = num_col_ = num_nonzero_ = 0;
|
||||
labels_.clear();
|
||||
root_index_.clear();
|
||||
group_ptr_.clear();
|
||||
weights_.clear();
|
||||
base_margin_.clear();
|
||||
}
|
||||
|
||||
void MetaInfo::SaveBinary(dmlc::Stream *fo) const {
|
||||
int32_t version = kVersion;
|
||||
fo->Write(&version, sizeof(version));
|
||||
fo->Write(&num_row, sizeof(num_row));
|
||||
fo->Write(&num_col, sizeof(num_col));
|
||||
fo->Write(&num_nonzero, sizeof(num_nonzero));
|
||||
fo->Write(labels);
|
||||
fo->Write(group_ptr);
|
||||
fo->Write(weights);
|
||||
fo->Write(root_index);
|
||||
fo->Write(base_margin);
|
||||
fo->Write(&num_row_, sizeof(num_row_));
|
||||
fo->Write(&num_col_, sizeof(num_col_));
|
||||
fo->Write(&num_nonzero_, sizeof(num_nonzero_));
|
||||
fo->Write(labels_);
|
||||
fo->Write(group_ptr_);
|
||||
fo->Write(weights_);
|
||||
fo->Write(root_index_);
|
||||
fo->Write(base_margin_);
|
||||
}
|
||||
|
||||
void MetaInfo::LoadBinary(dmlc::Stream *fi) {
|
||||
int version;
|
||||
CHECK(fi->Read(&version, sizeof(version)) == sizeof(version)) << "MetaInfo: invalid version";
|
||||
CHECK_EQ(version, kVersion) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&num_row, sizeof(num_row)) == sizeof(num_row)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&num_col, sizeof(num_col)) == sizeof(num_col)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&num_nonzero, sizeof(num_nonzero)) == sizeof(num_nonzero))
|
||||
CHECK(fi->Read(&num_row_, sizeof(num_row_)) == sizeof(num_row_)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&num_col_, sizeof(num_col_)) == sizeof(num_col_)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&num_nonzero_, sizeof(num_nonzero_)) == sizeof(num_nonzero_))
|
||||
<< "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&labels)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&group_ptr)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&weights)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&root_index)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&base_margin)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&labels_)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&group_ptr_)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&weights_)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&root_index_)) << "MetaInfo: invalid format";
|
||||
CHECK(fi->Read(&base_margin_)) << "MetaInfo: invalid format";
|
||||
}
|
||||
|
||||
// try to load group information from file, if exists
|
||||
inline bool MetaTryLoadGroup(const std::string& fname,
|
||||
std::vector<unsigned>* group) {
|
||||
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
|
||||
if (fi.get() == nullptr) return false;
|
||||
if (fi == nullptr) return false;
|
||||
dmlc::istream is(fi.get());
|
||||
group->clear();
|
||||
group->push_back(0);
|
||||
unsigned nline;
|
||||
unsigned nline = 0;
|
||||
while (is >> nline) {
|
||||
group->push_back(group->back() + nline);
|
||||
}
|
||||
@@ -79,7 +79,7 @@ inline bool MetaTryLoadGroup(const std::string& fname,
|
||||
inline bool MetaTryLoadFloatInfo(const std::string& fname,
|
||||
std::vector<bst_float>* data) {
|
||||
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
|
||||
if (fi.get() == nullptr) return false;
|
||||
if (fi == nullptr) return false;
|
||||
dmlc::istream is(fi.get());
|
||||
data->clear();
|
||||
bst_float value;
|
||||
@@ -93,16 +93,16 @@ inline bool MetaTryLoadFloatInfo(const std::string& fname,
|
||||
#define DISPATCH_CONST_PTR(dtype, old_ptr, cast_ptr, proc) \
|
||||
switch (dtype) { \
|
||||
case kFloat32: { \
|
||||
const float* cast_ptr = reinterpret_cast<const float*>(old_ptr); proc; break; \
|
||||
auto cast_ptr = reinterpret_cast<const float*>(old_ptr); proc; break; \
|
||||
} \
|
||||
case kDouble: { \
|
||||
const double* cast_ptr = reinterpret_cast<const double*>(old_ptr); proc; break; \
|
||||
auto cast_ptr = reinterpret_cast<const double*>(old_ptr); proc; break; \
|
||||
} \
|
||||
case kUInt32: { \
|
||||
const uint32_t* cast_ptr = reinterpret_cast<const uint32_t*>(old_ptr); proc; break; \
|
||||
auto cast_ptr = reinterpret_cast<const uint32_t*>(old_ptr); proc; break; \
|
||||
} \
|
||||
case kUInt64: { \
|
||||
const uint64_t* cast_ptr = reinterpret_cast<const uint64_t*>(old_ptr); proc; break; \
|
||||
auto cast_ptr = reinterpret_cast<const uint64_t*>(old_ptr); proc; break; \
|
||||
} \
|
||||
default: LOG(FATAL) << "Unknown data type" << dtype; \
|
||||
} \
|
||||
@@ -110,28 +110,28 @@ inline bool MetaTryLoadFloatInfo(const std::string& fname,
|
||||
|
||||
void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) {
|
||||
if (!std::strcmp(key, "root_index")) {
|
||||
root_index.resize(num);
|
||||
root_index_.resize(num);
|
||||
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
|
||||
std::copy(cast_dptr, cast_dptr + num, root_index.begin()));
|
||||
std::copy(cast_dptr, cast_dptr + num, root_index_.begin()));
|
||||
} else if (!std::strcmp(key, "label")) {
|
||||
labels.resize(num);
|
||||
labels_.resize(num);
|
||||
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
|
||||
std::copy(cast_dptr, cast_dptr + num, labels.begin()));
|
||||
std::copy(cast_dptr, cast_dptr + num, labels_.begin()));
|
||||
} else if (!std::strcmp(key, "weight")) {
|
||||
weights.resize(num);
|
||||
weights_.resize(num);
|
||||
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
|
||||
std::copy(cast_dptr, cast_dptr + num, weights.begin()));
|
||||
std::copy(cast_dptr, cast_dptr + num, weights_.begin()));
|
||||
} else if (!std::strcmp(key, "base_margin")) {
|
||||
base_margin.resize(num);
|
||||
base_margin_.resize(num);
|
||||
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
|
||||
std::copy(cast_dptr, cast_dptr + num, base_margin.begin()));
|
||||
std::copy(cast_dptr, cast_dptr + num, base_margin_.begin()));
|
||||
} else if (!std::strcmp(key, "group")) {
|
||||
group_ptr.resize(num + 1);
|
||||
group_ptr_.resize(num + 1);
|
||||
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
|
||||
std::copy(cast_dptr, cast_dptr + num, group_ptr.begin() + 1));
|
||||
group_ptr[0] = 0;
|
||||
for (size_t i = 1; i < group_ptr.size(); ++i) {
|
||||
group_ptr[i] = group_ptr[i - 1] + group_ptr[i];
|
||||
std::copy(cast_dptr, cast_dptr + num, group_ptr_.begin() + 1));
|
||||
group_ptr_[0] = 0;
|
||||
for (size_t i = 1; i < group_ptr_.size(); ++i) {
|
||||
group_ptr_[i] = group_ptr_[i - 1] + group_ptr_[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -163,7 +163,9 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
||||
<< "-" << rabit::GetWorldSize()
|
||||
<< cache_shards[i].substr(pos, cache_shards[i].length());
|
||||
}
|
||||
if (i + 1 != cache_shards.size()) os << ':';
|
||||
if (i + 1 != cache_shards.size()) {
|
||||
os << ':';
|
||||
}
|
||||
}
|
||||
cache_file = os.str();
|
||||
}
|
||||
@@ -187,7 +189,7 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
||||
if (file_format == "auto" && npart == 1) {
|
||||
int magic;
|
||||
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
|
||||
if (fi.get() != nullptr) {
|
||||
if (fi != nullptr) {
|
||||
common::PeekableInStream is(fi.get());
|
||||
if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic) &&
|
||||
magic == data::SimpleCSRSource::kMagic) {
|
||||
@@ -195,8 +197,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
||||
source->LoadBinary(&is);
|
||||
DMatrix* dmat = DMatrix::Create(std::move(source), cache_file);
|
||||
if (!silent) {
|
||||
LOG(CONSOLE) << dmat->info().num_row << 'x' << dmat->info().num_col << " matrix with "
|
||||
<< dmat->info().num_nonzero << " entries loaded from " << uri;
|
||||
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
|
||||
<< dmat->Info().num_nonzero_ << " entries loaded from " << uri;
|
||||
}
|
||||
return dmat;
|
||||
}
|
||||
@@ -207,26 +209,26 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
||||
dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
|
||||
DMatrix* dmat = DMatrix::Create(parser.get(), cache_file);
|
||||
if (!silent) {
|
||||
LOG(CONSOLE) << dmat->info().num_row << 'x' << dmat->info().num_col << " matrix with "
|
||||
<< dmat->info().num_nonzero << " entries loaded from " << uri;
|
||||
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
|
||||
<< dmat->Info().num_nonzero_ << " entries loaded from " << uri;
|
||||
}
|
||||
/* sync up number of features after matrix loaded.
|
||||
* partitioned data will fail the train/val validation check
|
||||
* since partitioned data does not know the real number of features. */
|
||||
rabit::Allreduce<rabit::op::Max>(&dmat->info().num_col, 1);
|
||||
rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
|
||||
// backward compatibility code.
|
||||
if (!load_row_split) {
|
||||
MetaInfo& info = dmat->info();
|
||||
if (MetaTryLoadGroup(fname + ".group", &info.group_ptr) && !silent) {
|
||||
LOG(CONSOLE) << info.group_ptr.size() - 1
|
||||
MetaInfo& info = dmat->Info();
|
||||
if (MetaTryLoadGroup(fname + ".group", &info.group_ptr_) && !silent) {
|
||||
LOG(CONSOLE) << info.group_ptr_.size() - 1
|
||||
<< " groups are loaded from " << fname << ".group";
|
||||
}
|
||||
if (MetaTryLoadFloatInfo(fname + ".base_margin", &info.base_margin) && !silent) {
|
||||
LOG(CONSOLE) << info.base_margin.size()
|
||||
if (MetaTryLoadFloatInfo(fname + ".base_margin", &info.base_margin_) && !silent) {
|
||||
LOG(CONSOLE) << info.base_margin_.size()
|
||||
<< " base_margin are loaded from " << fname << ".base_margin";
|
||||
}
|
||||
if (MetaTryLoadFloatInfo(fname + ".weight", &info.weights) && !silent) {
|
||||
LOG(CONSOLE) << info.weights.size()
|
||||
if (MetaTryLoadFloatInfo(fname + ".weight", &info.weights_) && !silent) {
|
||||
LOG(CONSOLE) << info.weights_.size()
|
||||
<< " weights are loaded from " << fname << ".weight";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ void SimpleCSRSource::Clear() {
|
||||
|
||||
void SimpleCSRSource::CopyFrom(DMatrix* src) {
|
||||
this->Clear();
|
||||
this->info = src->info();
|
||||
this->info = src->Info();
|
||||
dmlc::DataIter<RowBatch>* iter = src->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
@@ -36,10 +36,10 @@ void SimpleCSRSource::CopyFrom(dmlc::Parser<uint32_t>* parser) {
|
||||
while (parser->Next()) {
|
||||
const dmlc::RowBlock<uint32_t>& batch = parser->Value();
|
||||
if (batch.label != nullptr) {
|
||||
info.labels.insert(info.labels.end(), batch.label, batch.label + batch.size);
|
||||
info.labels_.insert(info.labels_.end(), batch.label, batch.label + batch.size);
|
||||
}
|
||||
if (batch.weight != nullptr) {
|
||||
info.weights.insert(info.weights.end(), batch.weight, batch.weight + batch.size);
|
||||
info.weights_.insert(info.weights_.end(), batch.weight, batch.weight + batch.size);
|
||||
}
|
||||
// Remove the assertion on batch.index, which can be null in the case that the data in this
|
||||
// batch is entirely sparse. Although it's true that this indicates a likely issue with the
|
||||
@@ -48,13 +48,13 @@ void SimpleCSRSource::CopyFrom(dmlc::Parser<uint32_t>* parser) {
|
||||
// CHECK(batch.index != nullptr);
|
||||
|
||||
// update information
|
||||
this->info.num_row += batch.size;
|
||||
this->info.num_row_ += batch.size;
|
||||
// copy the data over
|
||||
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
|
||||
uint32_t index = batch.index[i];
|
||||
bst_float fvalue = batch.value == nullptr ? 1.0f : batch.value[i];
|
||||
row_data_.push_back(SparseBatch::Entry(index, fvalue));
|
||||
this->info.num_col = std::max(this->info.num_col,
|
||||
row_data_.emplace_back(index, fvalue);
|
||||
this->info.num_col_ = std::max(this->info.num_col_,
|
||||
static_cast<uint64_t>(index + 1));
|
||||
}
|
||||
size_t top = row_ptr_.size();
|
||||
@@ -62,7 +62,7 @@ void SimpleCSRSource::CopyFrom(dmlc::Parser<uint32_t>* parser) {
|
||||
row_ptr_.push_back(row_ptr_[top - 1] + batch.offset[i + 1] - batch.offset[0]);
|
||||
}
|
||||
}
|
||||
this->info.num_nonzero = static_cast<uint64_t>(row_data_.size());
|
||||
this->info.num_nonzero_ = static_cast<uint64_t>(row_data_.size());
|
||||
}
|
||||
|
||||
void SimpleCSRSource::LoadBinary(dmlc::Stream* fi) {
|
||||
|
||||
@@ -35,9 +35,9 @@ class SimpleCSRSource : public DataSource {
|
||||
std::vector<RowBatch::Entry> row_data_;
|
||||
// functions
|
||||
/*! \brief default constructor */
|
||||
SimpleCSRSource() : row_ptr_(1, 0), at_first_(true) {}
|
||||
SimpleCSRSource() : row_ptr_(1, 0) {}
|
||||
/*! \brief destructor */
|
||||
virtual ~SimpleCSRSource() {}
|
||||
~SimpleCSRSource() override = default;
|
||||
/*! \brief clear the data structure */
|
||||
void Clear();
|
||||
/*!
|
||||
@@ -72,7 +72,7 @@ class SimpleCSRSource : public DataSource {
|
||||
|
||||
private:
|
||||
/*! \brief internal variable, used to support iterator interface */
|
||||
bool at_first_;
|
||||
bool at_first_{true};
|
||||
/*! \brief */
|
||||
RowBatch batch_;
|
||||
};
|
||||
|
||||
@@ -20,7 +20,7 @@ bool SimpleDMatrix::ColBatchIter::Next() {
|
||||
data_ptr_ += 1;
|
||||
SparsePage* pcol = cpages_[data_ptr_ - 1].get();
|
||||
batch_.size = col_index_.size();
|
||||
col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
|
||||
col_data_.resize(col_index_.size(), SparseBatch::Inst(nullptr, 0));
|
||||
for (size_t i = 0; i < col_data_.size(); ++i) {
|
||||
const bst_uint ridx = col_index_[i];
|
||||
col_data_[i] = SparseBatch::Inst
|
||||
@@ -33,7 +33,7 @@ bool SimpleDMatrix::ColBatchIter::Next() {
|
||||
}
|
||||
|
||||
dmlc::DataIter<ColBatch>* SimpleDMatrix::ColIterator() {
|
||||
size_t ncol = this->info().num_col;
|
||||
size_t ncol = this->Info().num_col_;
|
||||
col_iter_.col_index_.resize(ncol);
|
||||
for (size_t i = 0; i < ncol; ++i) {
|
||||
col_iter_.col_index_[i] = static_cast<bst_uint>(i);
|
||||
@@ -43,10 +43,10 @@ dmlc::DataIter<ColBatch>* SimpleDMatrix::ColIterator() {
|
||||
}
|
||||
|
||||
dmlc::DataIter<ColBatch>* SimpleDMatrix::ColIterator(const std::vector<bst_uint>&fset) {
|
||||
size_t ncol = this->info().num_col;
|
||||
size_t ncol = this->Info().num_col_;
|
||||
col_iter_.col_index_.resize(0);
|
||||
for (size_t i = 0; i < fset.size(); ++i) {
|
||||
if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
|
||||
for (auto fidx : fset) {
|
||||
if (fidx < ncol) col_iter_.col_index_.push_back(fidx);
|
||||
}
|
||||
col_iter_.BeforeFirst();
|
||||
return &col_iter_;
|
||||
@@ -56,9 +56,9 @@ void SimpleDMatrix::InitColAccess(const std::vector<bool> &enabled,
|
||||
float pkeep,
|
||||
size_t max_row_perbatch, bool sorted) {
|
||||
if (this->HaveColAccess(sorted)) return;
|
||||
col_iter_.sorted = sorted;
|
||||
col_iter_.sorted_ = sorted;
|
||||
col_iter_.cpages_.clear();
|
||||
if (info().num_row < max_row_perbatch) {
|
||||
if (Info().num_row_ < max_row_perbatch) {
|
||||
std::unique_ptr<SparsePage> page(new SparsePage());
|
||||
this->MakeOneBatch(enabled, pkeep, page.get(), sorted);
|
||||
col_iter_.cpages_.push_back(std::move(page));
|
||||
@@ -66,10 +66,10 @@ void SimpleDMatrix::InitColAccess(const std::vector<bool> &enabled,
|
||||
this->MakeManyBatch(enabled, pkeep, max_row_perbatch, sorted);
|
||||
}
|
||||
// setup col-size
|
||||
col_size_.resize(info().num_col);
|
||||
col_size_.resize(Info().num_col_);
|
||||
std::fill(col_size_.begin(), col_size_.end(), 0);
|
||||
for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) {
|
||||
SparsePage *pcol = col_iter_.cpages_[i].get();
|
||||
for (auto & cpage : col_iter_.cpages_) {
|
||||
SparsePage *pcol = cpage.get();
|
||||
for (size_t j = 0; j < pcol->Size(); ++j) {
|
||||
col_size_[j] += pcol->offset[j + 1] - pcol->offset[j];
|
||||
}
|
||||
@@ -80,14 +80,14 @@ void SimpleDMatrix::InitColAccess(const std::vector<bool> &enabled,
|
||||
void SimpleDMatrix::MakeOneBatch(const std::vector<bool>& enabled, float pkeep,
|
||||
SparsePage* pcol, bool sorted) {
|
||||
// clear rowset
|
||||
buffered_rowset_.clear();
|
||||
buffered_rowset_.Clear();
|
||||
// bit map
|
||||
const int nthread = omp_get_max_threads();
|
||||
std::vector<bool> bmap;
|
||||
pcol->Clear();
|
||||
common::ParallelGroupBuilder<SparseBatch::Entry>
|
||||
builder(&pcol->offset, &pcol->data);
|
||||
builder.InitBudget(info().num_col, nthread);
|
||||
builder.InitBudget(Info().num_col_, nthread);
|
||||
// start working
|
||||
dmlc::DataIter<RowBatch>* iter = this->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
@@ -99,9 +99,9 @@ void SimpleDMatrix::MakeOneBatch(const std::vector<bool>& enabled, float pkeep,
|
||||
|
||||
long batch_size = static_cast<long>(batch.size); // NOLINT(*)
|
||||
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (pkeep == 1.0f || coin_flip(rnd)) {
|
||||
buffered_rowset_.push_back(ridx);
|
||||
buffered_rowset_.PushBack(ridx);
|
||||
} else {
|
||||
bmap[i] = false;
|
||||
}
|
||||
@@ -109,7 +109,7 @@ void SimpleDMatrix::MakeOneBatch(const std::vector<bool>& enabled, float pkeep,
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (bmap[ridx]) {
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
@@ -128,13 +128,13 @@ void SimpleDMatrix::MakeOneBatch(const std::vector<bool>& enabled, float pkeep,
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long i = 0; i < static_cast<long>(batch.size); ++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (bmap[ridx]) {
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
if (enabled[inst[j].index]) {
|
||||
builder.Push(inst[j].index,
|
||||
SparseBatch::Entry((bst_uint)(batch.base_rowid+i),
|
||||
SparseBatch::Entry(static_cast<bst_uint>(batch.base_rowid+i),
|
||||
inst[j].fvalue), tid);
|
||||
}
|
||||
}
|
||||
@@ -142,11 +142,11 @@ void SimpleDMatrix::MakeOneBatch(const std::vector<bool>& enabled, float pkeep,
|
||||
}
|
||||
}
|
||||
|
||||
CHECK_EQ(pcol->Size(), info().num_col);
|
||||
CHECK_EQ(pcol->Size(), Info().num_col_);
|
||||
|
||||
if (sorted) {
|
||||
// sort columns
|
||||
bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
|
||||
auto ncol = static_cast<bst_omp_uint>(pcol->Size());
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ncol; ++i) {
|
||||
if (pcol->offset[i] < pcol->offset[i + 1]) {
|
||||
@@ -164,7 +164,7 @@ void SimpleDMatrix::MakeManyBatch(const std::vector<bool>& enabled,
|
||||
size_t btop = 0;
|
||||
std::bernoulli_distribution coin_flip(pkeep);
|
||||
auto& rnd = common::GlobalRandom();
|
||||
buffered_rowset_.clear();
|
||||
buffered_rowset_.Clear();
|
||||
// internal temp cache
|
||||
SparsePage tmp; tmp.Clear();
|
||||
// start working
|
||||
@@ -174,16 +174,16 @@ void SimpleDMatrix::MakeManyBatch(const std::vector<bool>& enabled,
|
||||
while (iter->Next()) {
|
||||
const RowBatch &batch = iter->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (pkeep == 1.0f || coin_flip(rnd)) {
|
||||
buffered_rowset_.push_back(ridx);
|
||||
buffered_rowset_.PushBack(ridx);
|
||||
tmp.Push(batch[i]);
|
||||
}
|
||||
if (tmp.Size() >= max_row_perbatch) {
|
||||
std::unique_ptr<SparsePage> page(new SparsePage());
|
||||
this->MakeColPage(tmp.GetRowBatch(0), btop, enabled, page.get(), sorted);
|
||||
col_iter_.cpages_.push_back(std::move(page));
|
||||
btop = buffered_rowset_.size();
|
||||
btop = buffered_rowset_.Size();
|
||||
tmp.Clear();
|
||||
}
|
||||
}
|
||||
@@ -205,7 +205,7 @@ void SimpleDMatrix::MakeColPage(const RowBatch& batch,
|
||||
pcol->Clear();
|
||||
common::ParallelGroupBuilder<SparseBatch::Entry>
|
||||
builder(&pcol->offset, &pcol->data);
|
||||
builder.InitBudget(info().num_col, nthread);
|
||||
builder.InitBudget(Info().num_col_, nthread);
|
||||
bst_omp_uint ndata = static_cast<bst_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||
@@ -231,10 +231,10 @@ void SimpleDMatrix::MakeColPage(const RowBatch& batch,
|
||||
tid);
|
||||
}
|
||||
}
|
||||
CHECK_EQ(pcol->Size(), info().num_col);
|
||||
CHECK_EQ(pcol->Size(), Info().num_col_);
|
||||
// sort columns
|
||||
if (sorted) {
|
||||
bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
|
||||
auto ncol = static_cast<bst_omp_uint>(pcol->Size());
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ncol; ++i) {
|
||||
if (pcol->offset[i] < pcol->offset[i + 1]) {
|
||||
|
||||
@@ -22,11 +22,11 @@ class SimpleDMatrix : public DMatrix {
|
||||
explicit SimpleDMatrix(std::unique_ptr<DataSource>&& source)
|
||||
: source_(std::move(source)) {}
|
||||
|
||||
MetaInfo& info() override {
|
||||
MetaInfo& Info() override {
|
||||
return source_->info;
|
||||
}
|
||||
|
||||
const MetaInfo& info() const override {
|
||||
const MetaInfo& Info() const override {
|
||||
return source_->info;
|
||||
}
|
||||
|
||||
@@ -37,10 +37,10 @@ class SimpleDMatrix : public DMatrix {
|
||||
}
|
||||
|
||||
bool HaveColAccess(bool sorted) const override {
|
||||
return col_size_.size() != 0 && col_iter_.sorted == sorted;
|
||||
return col_size_.size() != 0 && col_iter_.sorted_ == sorted;
|
||||
}
|
||||
|
||||
const RowSet& buffered_rowset() const override {
|
||||
const RowSet& BufferedRowset() const override {
|
||||
return buffered_rowset_;
|
||||
}
|
||||
|
||||
@@ -49,8 +49,8 @@ class SimpleDMatrix : public DMatrix {
|
||||
}
|
||||
|
||||
float GetColDensity(size_t cidx) const override {
|
||||
size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
|
||||
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
|
||||
size_t nmiss = buffered_rowset_.Size() - col_size_[cidx];
|
||||
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.Size();
|
||||
}
|
||||
|
||||
dmlc::DataIter<ColBatch>* ColIterator() override;
|
||||
@@ -67,7 +67,7 @@ class SimpleDMatrix : public DMatrix {
|
||||
// in-memory column batch iterator.
|
||||
struct ColBatchIter: dmlc::DataIter<ColBatch> {
|
||||
public:
|
||||
ColBatchIter() : data_ptr_(0), sorted(false) {}
|
||||
ColBatchIter() = default;
|
||||
void BeforeFirst() override {
|
||||
data_ptr_ = 0;
|
||||
}
|
||||
@@ -86,11 +86,11 @@ class SimpleDMatrix : public DMatrix {
|
||||
// column sparse pages
|
||||
std::vector<std::unique_ptr<SparsePage> > cpages_;
|
||||
// data pointer
|
||||
size_t data_ptr_;
|
||||
size_t data_ptr_{0};
|
||||
// temporary space for batch
|
||||
ColBatch batch_;
|
||||
// Is column sorted?
|
||||
bool sorted;
|
||||
bool sorted_{false};
|
||||
};
|
||||
|
||||
// source data pointer.
|
||||
|
||||
@@ -51,11 +51,11 @@ class SparsePage {
|
||||
return offset.size() - 1;
|
||||
}
|
||||
/*! \return estimation of memory cost of this page */
|
||||
inline size_t MemCostBytes(void) const {
|
||||
inline size_t MemCostBytes() const {
|
||||
return offset.size() * sizeof(size_t) + data.size() * sizeof(SparseBatch::Entry);
|
||||
}
|
||||
/*! \brief clear the page */
|
||||
inline void Clear(void) {
|
||||
inline void Clear() {
|
||||
min_index = 0;
|
||||
offset.clear();
|
||||
offset.push_back(0);
|
||||
@@ -92,7 +92,7 @@ class SparsePage {
|
||||
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
|
||||
uint32_t index = batch.index[i];
|
||||
bst_float fvalue = batch.value == nullptr ? 1.0f : batch.value[i];
|
||||
data.push_back(SparseBatch::Entry(index, fvalue));
|
||||
data.emplace_back(index, fvalue);
|
||||
}
|
||||
CHECK_EQ(offset.back(), data.size());
|
||||
}
|
||||
@@ -145,7 +145,7 @@ class SparsePage {
|
||||
class SparsePage::Format {
|
||||
public:
|
||||
/*! \brief virtual destructor */
|
||||
virtual ~Format() {}
|
||||
virtual ~Format() = default;
|
||||
/*!
|
||||
* \brief Load all the segments into page, advance fi to end of the block.
|
||||
* \param page The data to read page into.
|
||||
|
||||
@@ -94,9 +94,9 @@ void SparsePageDMatrix::ColPageIter::Init(const std::vector<bst_uint>& index_set
|
||||
}
|
||||
|
||||
dmlc::DataIter<ColBatch>* SparsePageDMatrix::ColIterator() {
|
||||
CHECK(col_iter_.get() != nullptr);
|
||||
CHECK(col_iter_ != nullptr);
|
||||
std::vector<bst_uint> col_index;
|
||||
size_t ncol = this->info().num_col;
|
||||
size_t ncol = this->Info().num_col_;
|
||||
for (size_t i = 0; i < ncol; ++i) {
|
||||
col_index.push_back(static_cast<bst_uint>(i));
|
||||
}
|
||||
@@ -106,12 +106,12 @@ dmlc::DataIter<ColBatch>* SparsePageDMatrix::ColIterator() {
|
||||
|
||||
dmlc::DataIter<ColBatch>* SparsePageDMatrix::
|
||||
ColIterator(const std::vector<bst_uint>& fset) {
|
||||
CHECK(col_iter_.get() != nullptr);
|
||||
CHECK(col_iter_ != nullptr);
|
||||
std::vector<bst_uint> col_index;
|
||||
size_t ncol = this->info().num_col;
|
||||
for (size_t i = 0; i < fset.size(); ++i) {
|
||||
if (fset[i] < ncol) {
|
||||
col_index.push_back(fset[i]);
|
||||
size_t ncol = this->Info().num_col_;
|
||||
for (auto fidx : fset) {
|
||||
if (fidx < ncol) {
|
||||
col_index.push_back(fidx);
|
||||
}
|
||||
}
|
||||
col_iter_->Init(col_index, false);
|
||||
@@ -126,7 +126,7 @@ bool SparsePageDMatrix::TryInitColData(bool sorted) {
|
||||
std::string col_meta_name = cache_shards[0] + ".col.meta";
|
||||
std::unique_ptr<dmlc::Stream> fmeta(
|
||||
dmlc::Stream::Create(col_meta_name.c_str(), "r", true));
|
||||
if (fmeta.get() == nullptr) return false;
|
||||
if (fmeta == nullptr) return false;
|
||||
CHECK(fmeta->Read(&buffered_rowset_)) << "invalid col.meta file";
|
||||
CHECK(fmeta->Read(&col_size_)) << "invalid col.meta file";
|
||||
}
|
||||
@@ -136,7 +136,7 @@ bool SparsePageDMatrix::TryInitColData(bool sorted) {
|
||||
std::string col_data_name = prefix + ".col.page";
|
||||
std::unique_ptr<dmlc::SeekStream> fdata(
|
||||
dmlc::SeekStream::CreateForRead(col_data_name.c_str(), true));
|
||||
if (fdata.get() == nullptr) return false;
|
||||
if (fdata == nullptr) return false;
|
||||
files.push_back(std::move(fdata));
|
||||
}
|
||||
col_iter_.reset(new ColPageIter(std::move(files)));
|
||||
@@ -150,12 +150,12 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
size_t max_row_perbatch, bool sorted) {
|
||||
if (HaveColAccess(sorted)) return;
|
||||
if (TryInitColData(sorted)) return;
|
||||
const MetaInfo& info = this->info();
|
||||
const MetaInfo& info = this->Info();
|
||||
if (max_row_perbatch == std::numeric_limits<size_t>::max()) {
|
||||
max_row_perbatch = kMaxRowPerBatch;
|
||||
}
|
||||
buffered_rowset_.clear();
|
||||
col_size_.resize(info.num_col);
|
||||
buffered_rowset_.Clear();
|
||||
col_size_.resize(info.num_col_);
|
||||
std::fill(col_size_.begin(), col_size_.end(), 0);
|
||||
dmlc::DataIter<RowBatch>* iter = this->RowIterator();
|
||||
std::bernoulli_distribution coin_flip(pkeep);
|
||||
@@ -173,7 +173,7 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
const int nthread = std::max(omp_get_max_threads(), std::max(omp_get_num_procs() / 2 - 1, 1));
|
||||
common::ParallelGroupBuilder<SparseBatch::Entry>
|
||||
builder(&pcol->offset, &pcol->data);
|
||||
builder.InitBudget(info.num_col, nthread);
|
||||
builder.InitBudget(info.num_col_, nthread);
|
||||
bst_omp_uint ndata = static_cast<bst_uint>(prow.Size());
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||
@@ -196,10 +196,10 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
tid);
|
||||
}
|
||||
}
|
||||
CHECK_EQ(pcol->Size(), info.num_col);
|
||||
CHECK_EQ(pcol->Size(), info.num_col_);
|
||||
// sort columns
|
||||
if (sorted) {
|
||||
bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
|
||||
auto ncol = static_cast<bst_omp_uint>(pcol->Size());
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ncol; ++i) {
|
||||
if (pcol->offset[i] < pcol->offset[i + 1]) {
|
||||
@@ -213,16 +213,16 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
|
||||
auto make_next_col = [&] (SparsePage* dptr) {
|
||||
tmp.Clear();
|
||||
size_t btop = buffered_rowset_.size();
|
||||
size_t btop = buffered_rowset_.Size();
|
||||
|
||||
while (true) {
|
||||
if (batch_ptr != batch_top) {
|
||||
const RowBatch& batch = iter->Value();
|
||||
CHECK_EQ(batch_top, batch.size);
|
||||
for (size_t i = batch_ptr; i < batch_top; ++i) {
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (pkeep == 1.0f || coin_flip(rnd)) {
|
||||
buffered_rowset_.push_back(ridx);
|
||||
buffered_rowset_.PushBack(ridx);
|
||||
tmp.Push(batch[i]);
|
||||
}
|
||||
|
||||
@@ -263,7 +263,7 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
double tstart = dmlc::GetTime();
|
||||
size_t bytes_write = 0;
|
||||
// print every 4 sec.
|
||||
const double kStep = 4.0;
|
||||
constexpr double kStep = 4.0;
|
||||
size_t tick_expected = kStep;
|
||||
|
||||
while (make_next_col(page.get())) {
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <xgboost/base.h>
|
||||
#include <xgboost/data.h>
|
||||
#include <dmlc/threadediter.h>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
@@ -22,15 +23,15 @@ namespace data {
|
||||
class SparsePageDMatrix : public DMatrix {
|
||||
public:
|
||||
explicit SparsePageDMatrix(std::unique_ptr<DataSource>&& source,
|
||||
const std::string& cache_info)
|
||||
: source_(std::move(source)), cache_info_(cache_info) {
|
||||
std::string cache_info)
|
||||
: source_(std::move(source)), cache_info_(std::move(cache_info)) {
|
||||
}
|
||||
|
||||
MetaInfo& info() override {
|
||||
MetaInfo& Info() override {
|
||||
return source_->info;
|
||||
}
|
||||
|
||||
const MetaInfo& info() const override {
|
||||
const MetaInfo& Info() const override {
|
||||
return source_->info;
|
||||
}
|
||||
|
||||
@@ -41,10 +42,10 @@ class SparsePageDMatrix : public DMatrix {
|
||||
}
|
||||
|
||||
bool HaveColAccess(bool sorted) const override {
|
||||
return col_iter_.get() != nullptr && col_iter_->sorted == sorted;
|
||||
return col_iter_ != nullptr && col_iter_->sorted == sorted;
|
||||
}
|
||||
|
||||
const RowSet& buffered_rowset() const override {
|
||||
const RowSet& BufferedRowset() const override {
|
||||
return buffered_rowset_;
|
||||
}
|
||||
|
||||
@@ -53,8 +54,8 @@ class SparsePageDMatrix : public DMatrix {
|
||||
}
|
||||
|
||||
float GetColDensity(size_t cidx) const override {
|
||||
size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
|
||||
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
|
||||
size_t nmiss = buffered_rowset_.Size() - col_size_[cidx];
|
||||
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.Size();
|
||||
}
|
||||
|
||||
bool SingleColBlock() const override {
|
||||
@@ -79,7 +80,7 @@ class SparsePageDMatrix : public DMatrix {
|
||||
class ColPageIter : public dmlc::DataIter<ColBatch> {
|
||||
public:
|
||||
explicit ColPageIter(std::vector<std::unique_ptr<dmlc::SeekStream> >&& files);
|
||||
virtual ~ColPageIter();
|
||||
~ColPageIter() override;
|
||||
void BeforeFirst() override;
|
||||
const ColBatch &Value() const override {
|
||||
return out_;
|
||||
|
||||
@@ -34,8 +34,7 @@ class SparsePageRawFormat : public SparsePage::Format {
|
||||
// setup the offset
|
||||
page->offset.clear();
|
||||
page->offset.push_back(0);
|
||||
for (size_t i = 0; i < sorted_index_set.size(); ++i) {
|
||||
bst_uint fid = sorted_index_set[i];
|
||||
for (unsigned int fid : sorted_index_set) {
|
||||
CHECK_LT(fid + 1, disk_offset_.size());
|
||||
size_t size = disk_offset_[fid + 1] - disk_offset_[fid];
|
||||
page->offset.push_back(page->offset.back() + size);
|
||||
|
||||
@@ -89,12 +89,12 @@ bool SparsePageSource::CacheExist(const std::string& cache_info) {
|
||||
{
|
||||
std::string name_info = cache_shards[0];
|
||||
std::unique_ptr<dmlc::Stream> finfo(dmlc::Stream::Create(name_info.c_str(), "r", true));
|
||||
if (finfo.get() == nullptr) return false;
|
||||
if (finfo == nullptr) return false;
|
||||
}
|
||||
for (const std::string& prefix : cache_shards) {
|
||||
std::string name_row = prefix + ".row.page";
|
||||
std::unique_ptr<dmlc::Stream> frow(dmlc::Stream::Create(name_row.c_str(), "r", true));
|
||||
if (frow.get() == nullptr) return false;
|
||||
if (frow == nullptr) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -119,22 +119,22 @@ void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
|
||||
size_t bytes_write = 0;
|
||||
double tstart = dmlc::GetTime();
|
||||
// print every 4 sec.
|
||||
const double kStep = 4.0;
|
||||
constexpr double kStep = 4.0;
|
||||
size_t tick_expected = static_cast<double>(kStep);
|
||||
|
||||
while (src->Next()) {
|
||||
const dmlc::RowBlock<uint32_t>& batch = src->Value();
|
||||
if (batch.label != nullptr) {
|
||||
info.labels.insert(info.labels.end(), batch.label, batch.label + batch.size);
|
||||
info.labels_.insert(info.labels_.end(), batch.label, batch.label + batch.size);
|
||||
}
|
||||
if (batch.weight != nullptr) {
|
||||
info.weights.insert(info.weights.end(), batch.weight, batch.weight + batch.size);
|
||||
info.weights_.insert(info.weights_.end(), batch.weight, batch.weight + batch.size);
|
||||
}
|
||||
info.num_row += batch.size;
|
||||
info.num_nonzero += batch.offset[batch.size] - batch.offset[0];
|
||||
info.num_row_ += batch.size;
|
||||
info.num_nonzero_ += batch.offset[batch.size] - batch.offset[0];
|
||||
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
|
||||
uint32_t index = batch.index[i];
|
||||
info.num_col = std::max(info.num_col,
|
||||
info.num_col_ = std::max(info.num_col_,
|
||||
static_cast<uint64_t>(index + 1));
|
||||
}
|
||||
page->Push(batch);
|
||||
@@ -183,7 +183,7 @@ void SparsePageSource::Create(DMatrix* src,
|
||||
std::shared_ptr<SparsePage> page;
|
||||
writer.Alloc(&page); page->Clear();
|
||||
|
||||
MetaInfo info = src->info();
|
||||
MetaInfo info = src->Info();
|
||||
size_t bytes_write = 0;
|
||||
double tstart = dmlc::GetTime();
|
||||
dmlc::DataIter<RowBatch>* iter = src->RowIterator();
|
||||
|
||||
@@ -33,7 +33,7 @@ class SparsePageSource : public DataSource {
|
||||
*/
|
||||
explicit SparsePageSource(const std::string& cache_prefix) noexcept(false);
|
||||
/*! \brief destructor */
|
||||
virtual ~SparsePageSource();
|
||||
~SparsePageSource() override;
|
||||
// implement Next
|
||||
bool Next() override;
|
||||
// implement BeforeFirst
|
||||
|
||||
@@ -34,7 +34,7 @@ SparsePage::Writer::Writer(
|
||||
fo->Write(format_shard);
|
||||
std::shared_ptr<SparsePage> page;
|
||||
while (wqueue->Pop(&page)) {
|
||||
if (page.get() == nullptr) break;
|
||||
if (page == nullptr) break;
|
||||
fmt->Write(*page, fo.get());
|
||||
qrecycle_.Push(std::move(page));
|
||||
}
|
||||
@@ -61,7 +61,7 @@ void SparsePage::Writer::PushWrite(std::shared_ptr<SparsePage>&& page) {
|
||||
}
|
||||
|
||||
void SparsePage::Writer::Alloc(std::shared_ptr<SparsePage>* out_page) {
|
||||
CHECK(out_page->get() == nullptr);
|
||||
CHECK(*out_page == nullptr);
|
||||
if (num_free_buffer_ != 0) {
|
||||
out_page->reset(new SparsePage());
|
||||
--num_free_buffer_;
|
||||
|
||||
Reference in New Issue
Block a user