Optimize adapter element counting on GPU. (#9209)

- Implement a simple `IterSpan` for passing iterators with size.
- Use shared memory for column size counts.
- Use one thread for each sample in row count to reduce atomic operations.
This commit is contained in:
Jiaming Yuan
2023-05-30 23:28:43 +08:00
committed by GitHub
parent 097f11b6e0
commit 17fd3f55e9
9 changed files with 323 additions and 61 deletions

View File

@@ -204,9 +204,8 @@ void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page,
return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size.
});
detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
batch_it, dummy_is_valid,
0, sorted_entries.size(),
&cuts_ptr, &column_sizes_scan);
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
@@ -273,9 +272,8 @@ void ProcessWeightedBatch(int device, const SparsePage& page,
return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size.
});
detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
batch_it, dummy_is_valid,
0, sorted_entries.size(),
&cuts_ptr, &column_sizes_scan);
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,