Initial support for column-wise data split (#8468)

This commit is contained in:
Rong Ou
2022-12-03 09:37:51 -08:00
committed by GitHub
parent c0609b98f1
commit 78d65a1928
8 changed files with 135 additions and 3 deletions

View File

@@ -45,6 +45,29 @@ DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
return out;
}
/**
 * \brief Create a new SimpleDMatrix that keeps only the entries whose feature
 *        index falls in the half-open column range [start, start + size).
 *
 * Every row of the source is preserved (rows with no entry in the range become
 * empty rows). Entries are copied unchanged, so feature indices are NOT re-based
 * to start at zero. The meta info is a copy of this matrix's meta info with only
 * `num_nonzero_` overwritten to reflect the filtered entry count.
 *
 * \param start First column (feature index) to keep.
 * \param size  Number of consecutive columns to keep.
 *
 * \return A matrix allocated with `new` and returned as a raw pointer; this
 *         function does not retain or free it.
 */
DMatrix* SimpleDMatrix::SliceCol(std::size_t start, std::size_t size) {
  auto out = new SimpleDMatrix;
  SparsePage& out_page = *out->sparse_page_;
  for (auto const& page : this->GetBatches<SparsePage>()) {
    auto batch = page.GetView();
    auto& h_data = out_page.data.HostVector();
    auto& h_offset = out_page.offset.HostVector();
    std::size_t rptr{0};

    // Use the row count's own (unsigned) type for the index. The previous
    // `auto i = 0` deduced `int`, which mixes signedness in the comparison and
    // overflows for matrices with more than INT_MAX rows.
    auto n_rows = this->Info().num_row_;
    for (decltype(n_rows) i = 0; i < n_rows; ++i) {
      auto inst = batch[i];
      auto const prev_size = h_data.size();
      std::copy_if(inst.begin(), inst.end(), std::back_inserter(h_data),
                   [start, size](Entry const& e) {
                     // Written as a subtraction so that a very large `size`
                     // cannot wrap `start + size` around and reject everything.
                     return e.index >= start && e.index - start < size;
                   });
      // Row pointer advances by however many entries survived the filter.
      rptr += h_data.size() - prev_size;
      h_offset.emplace_back(rptr);
    }

    out->Info() = this->Info().Copy();
    // NOTE(review): relies on a freshly constructed SparsePage already holding
    // the leading 0 in its offset vector, so `back()` is valid even when the
    // matrix has zero rows -- confirm against SparsePage's constructor.
    out->Info().num_nonzero_ = h_offset.back();
  }
  return out;
}
BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
// since csr is the default data structure so `source_` is always available.
auto begin_iter = BatchIterator<SparsePage>(