[MEM] Add RowSet struct to save memory on datasets with billions of rows
This commit is contained in:
@@ -184,9 +184,7 @@ void SimpleDMatrix::MakeManyBatch(const std::vector<bool>& enabled,
|
||||
}
|
||||
if (tmp.Size() >= max_row_perbatch) {
|
||||
std::unique_ptr<SparsePage> page(new SparsePage());
|
||||
this->MakeColPage(tmp.GetRowBatch(0),
|
||||
dmlc::BeginPtr(buffered_rowset_) + btop,
|
||||
enabled, page.get());
|
||||
this->MakeColPage(tmp.GetRowBatch(0), btop, enabled, page.get());
|
||||
col_iter_.cpages_.push_back(std::move(page));
|
||||
btop = buffered_rowset_.size();
|
||||
tmp.Clear();
|
||||
@@ -196,16 +194,14 @@ void SimpleDMatrix::MakeManyBatch(const std::vector<bool>& enabled,
|
||||
|
||||
if (tmp.Size() != 0) {
|
||||
std::unique_ptr<SparsePage> page(new SparsePage());
|
||||
this->MakeColPage(tmp.GetRowBatch(0),
|
||||
dmlc::BeginPtr(buffered_rowset_) + btop,
|
||||
enabled, page.get());
|
||||
this->MakeColPage(tmp.GetRowBatch(0), btop, enabled, page.get());
|
||||
col_iter_.cpages_.push_back(std::move(page));
|
||||
}
|
||||
}
|
||||
|
||||
// make column page from a subset of row batches
|
||||
void SimpleDMatrix::MakeColPage(const RowBatch& batch,
|
||||
const bst_uint* ridx,
|
||||
size_t buffer_begin,
|
||||
const std::vector<bool>& enabled,
|
||||
SparsePage* pcol) {
|
||||
int nthread;
|
||||
@@ -240,9 +236,10 @@ void SimpleDMatrix::MakeColPage(const RowBatch& batch,
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
const SparseBatch::Entry &e = inst[j];
|
||||
builder.Push(e.index,
|
||||
SparseBatch::Entry(ridx[i], e.fvalue),
|
||||
tid);
|
||||
builder.Push(
|
||||
e.index,
|
||||
SparseBatch::Entry(buffered_rowset_[i + buffer_begin], e.fvalue),
|
||||
tid);
|
||||
}
|
||||
}
|
||||
CHECK_EQ(pcol->Size(), info().num_col);
|
||||
|
||||
@@ -40,7 +40,7 @@ class SimpleDMatrix : public DMatrix {
|
||||
return col_size_.size() != 0;
|
||||
}
|
||||
|
||||
const std::vector<bst_uint>& buffered_rowset() const override {
|
||||
const RowSet& buffered_rowset() const override {
|
||||
return buffered_rowset_;
|
||||
}
|
||||
|
||||
@@ -96,7 +96,7 @@ class SimpleDMatrix : public DMatrix {
|
||||
// column iterator
|
||||
ColBatchIter col_iter_;
|
||||
// list of row indices that are buffered.
|
||||
std::vector<bst_uint> buffered_rowset_;
|
||||
RowSet buffered_rowset_;
|
||||
/*! \brief size of column data */
|
||||
std::vector<size_t> col_size_;
|
||||
|
||||
@@ -110,7 +110,7 @@ class SimpleDMatrix : public DMatrix {
|
||||
size_t max_row_perbatch);
|
||||
|
||||
void MakeColPage(const RowBatch& batch,
|
||||
const bst_uint* ridx,
|
||||
size_t buffer_begin,
|
||||
const std::vector<bool>& enabled,
|
||||
SparsePage* pcol);
|
||||
};
|
||||
|
||||
@@ -165,10 +165,10 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
// function to create the page.
|
||||
auto make_col_batch = [&] (
|
||||
const SparsePage& prow,
|
||||
const bst_uint* ridx,
|
||||
size_t begin,
|
||||
SparsePage *pcol) {
|
||||
pcol->Clear();
|
||||
pcol->min_index = ridx[0];
|
||||
pcol->min_index = buffered_rowset_[begin];
|
||||
int nthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
@@ -196,7 +196,7 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
|
||||
const SparseBatch::Entry &e = prow.data[j];
|
||||
builder.Push(e.index,
|
||||
SparseBatch::Entry(ridx[i], e.fvalue),
|
||||
SparseBatch::Entry(buffered_rowset_[i + begin], e.fvalue),
|
||||
tid);
|
||||
}
|
||||
}
|
||||
@@ -230,7 +230,7 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
|
||||
if (tmp.Size() >= max_row_perbatch ||
|
||||
tmp.MemCostBytes() >= kPageSize) {
|
||||
make_col_batch(tmp, dmlc::BeginPtr(buffered_rowset_) + btop, dptr);
|
||||
make_col_batch(tmp, btop, dptr);
|
||||
batch_ptr = i + 1;
|
||||
return true;
|
||||
}
|
||||
@@ -243,7 +243,7 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
|
||||
}
|
||||
|
||||
if (tmp.Size() != 0) {
|
||||
make_col_batch(tmp, dmlc::BeginPtr(buffered_rowset_) + btop, dptr);
|
||||
make_col_batch(tmp, btop, dptr);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
|
||||
@@ -44,7 +44,7 @@ class SparsePageDMatrix : public DMatrix {
|
||||
return col_iter_.get() != nullptr;
|
||||
}
|
||||
|
||||
const std::vector<bst_uint>& buffered_rowset() const override {
|
||||
const RowSet& buffered_rowset() const override {
|
||||
return buffered_rowset_;
|
||||
}
|
||||
|
||||
@@ -120,7 +120,7 @@ class SparsePageDMatrix : public DMatrix {
|
||||
// the cache prefix
|
||||
std::string cache_info_;
|
||||
/*! \brief list of row indices that are buffered */
|
||||
std::vector<bst_uint> buffered_rowset_;
|
||||
RowSet buffered_rowset_;
|
||||
// count for column data
|
||||
std::vector<size_t> col_size_;
|
||||
// internal column iter.
|
||||
|
||||
Reference in New Issue
Block a user