Optimize adapter element counting on GPU. (#9209)

- Implement a simple `IterSpan` for passing iterators with size. - Use shared memory for column size counts. - Use one thread for each sample in row count to reduce atomic operations.
2023-05-30 23:28:43 +08:00
parent 097f11b6e0
commit 17fd3f55e9
9 changed files with 323 additions and 61 deletions
--- a/tests/cpp/common/test_quantile.cu
+++ b/tests/cpp/common/test_quantile.cu
@@ -50,7 +50,7 @@ void TestSketchUnique(float sparsity) {
        thrust::make_counting_iterator(0llu),
        [=] __device__(size_t idx) { return batch.GetElement(idx); });
    auto end = kCols * kRows;
-    detail::GetColumnSizesScan(0, kCols, n_cuts, batch_iter, is_valid, 0, end,
+    detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
                               &cut_sizes_scan, &column_sizes_scan);
    auto const& cut_sizes = cut_sizes_scan.HostVector();
    ASSERT_LE(sketch.Data().size(), cut_sizes.back());