Implement column sampler in CUDA. (#9785)

- CUDA implementation.
- Extract the broadcasting logic; we will need the context parameter after revamping the collective implementation.
- Some changes to the event loop to fix a deadlock in CI.
- Move argsort into algorithms.cuh and add support for CUDA streams.
This commit is contained in:
Jiaming Yuan
2023-11-17 04:29:08 +08:00
committed by GitHub
parent 178cfe70a8
commit fedd9674c8
20 changed files with 447 additions and 232 deletions

View File

@@ -57,13 +57,13 @@ TEST(Algorithm, GpuArgSort) {
auto ctx = MakeCUDACtx(0);
dh::device_vector<float> values(20);
dh::Iota(dh::ToSpan(values)); // ascending
dh::Iota(dh::ToSpan(values), ctx.CUDACtx()->Stream()); // ascending
dh::device_vector<size_t> sorted_idx(20);
dh::ArgSort<false>(dh::ToSpan(values), dh::ToSpan(sorted_idx)); // sort to descending
ASSERT_TRUE(thrust::is_sorted(thrust::device, sorted_idx.begin(), sorted_idx.end(),
ArgSort<false>(&ctx, dh::ToSpan(values), dh::ToSpan(sorted_idx)); // sort to descending
ASSERT_TRUE(thrust::is_sorted(ctx.CUDACtx()->CTP(), sorted_idx.begin(), sorted_idx.end(),
thrust::greater<size_t>{}));
dh::Iota(dh::ToSpan(values));
dh::Iota(dh::ToSpan(values), ctx.CUDACtx()->Stream());
dh::device_vector<size_t> groups(3);
groups[0] = 0;
groups[1] = 10;