Optimize adapter element counting on GPU. (#9209)

- Implement a simple `IterSpan` for passing iterators with size.
- Use shared memory for column size counts.
- Use one thread for each sample in row count to reduce atomic operations.
This commit is contained in:
Jiaming Yuan
2023-05-30 23:28:43 +08:00
committed by GitHub
parent 097f11b6e0
commit 17fd3f55e9
9 changed files with 323 additions and 61 deletions

View File

@@ -50,7 +50,7 @@ void TestSketchUnique(float sparsity) {
thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) { return batch.GetElement(idx); });
auto end = kCols * kRows;
detail::GetColumnSizesScan(0, kCols, n_cuts, batch_iter, is_valid, 0, end,
detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
&cut_sizes_scan, &column_sizes_scan);
auto const& cut_sizes = cut_sizes_scan.HostVector();
ASSERT_LE(sketch.Data().size(), cut_sizes.back());