Optimize adapter element counting on GPU. (#9209)
- Implement a simple `IterSpan` for passing iterators with size. - Use shared memory for column size counts. - Use one thread for each sample in row count to reduce atomic operations.
This commit is contained in:
@@ -50,7 +50,7 @@ void TestSketchUnique(float sparsity) {
|
||||
thrust::make_counting_iterator(0llu),
|
||||
[=] __device__(size_t idx) { return batch.GetElement(idx); });
|
||||
auto end = kCols * kRows;
|
||||
detail::GetColumnSizesScan(0, kCols, n_cuts, batch_iter, is_valid, 0, end,
|
||||
detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
|
||||
&cut_sizes_scan, &column_sizes_scan);
|
||||
auto const& cut_sizes = cut_sizes_scan.HostVector();
|
||||
ASSERT_LE(sketch.Data().size(), cut_sizes.back());
|
||||
|
||||
Reference in New Issue
Block a user