Optimize cpu sketch allreduce for sparse data. (#6009)

* Bypass RABIT serialization reducer and use custom allgather based merging.
This commit is contained in:
Jiaming Yuan
2020-08-19 10:03:45 +08:00
committed by GitHub
parent 90355b4f00
commit 29b7fea572
10 changed files with 357 additions and 87 deletions

View File

@@ -116,26 +116,14 @@ inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins) {
for (auto& column : column_sizes) {
column.resize(info.num_col_, 0);
}
for (auto const& page : m->GetBatches<SparsePage>()) {
page.data.HostVector();
page.offset.HostVector();
ParallelFor(page.Size(), threads, [&](size_t i) {
auto &local_column_sizes = column_sizes.at(omp_get_thread_num());
auto row = page[i];
auto const *p_row = row.data();
for (size_t j = 0; j < row.size(); ++j) {
local_column_sizes.at(p_row[j].index)++;
}
});
}
std::vector<bst_row_t> reduced(info.num_col_, 0);
ParallelFor(info.num_col_, threads, [&](size_t i) {
for (auto const &thread : column_sizes) {
reduced[i] += thread[i];
for (auto const& page : m->GetBatches<SparsePage>()) {
auto const &entries_per_column =
HostSketchContainer::CalcColumnSize(page, info.num_col_, threads);
for (size_t i = 0; i < entries_per_column.size(); ++i) {
reduced[i] += entries_per_column[i];
}
});
}
HostSketchContainer container(reduced, max_bins,
HostSketchContainer::UseGroup(info));
for (auto const &page : m->GetBatches<SparsePage>()) {