Enable compiling with system cub. (#7232)

- Tested with all CUDA 11.x versions. - Work around cub scan by using a discard iterator in AUC. - Limit the size of Argsort when compiled with CUDA cub.
2021-09-17 14:28:18 +08:00
parent b18f5f61b0
commit c311a8c1d8
6 changed files with 67 additions and 26 deletions
--- a/src/tree/gpu_hist/row_partitioner.cu
+++ b/src/tree/gpu_hist/row_partitioner.cu
@@ -50,9 +50,6 @@ struct WriteResultsFunctor {
  }
 };

-// Change the value type of thrust discard iterator so we can use it with cub
-using DiscardOverload = thrust::discard_iterator<IndexFlagTuple>;
-
 // Implement partitioning via single scan operation using transform output to
 // write the result
 void RowPartitioner::SortPosition(common::Span<bst_node_t> position,
@@ -64,7 +61,7 @@ void RowPartitioner::SortPosition(common::Span<bst_node_t> position,
  WriteResultsFunctor write_results{left_nidx, position, position_out,
                                    ridx,      ridx_out, d_left_count};
  auto discard_write_iterator =
-      thrust::make_transform_output_iterator(DiscardOverload(), write_results);
+      thrust::make_transform_output_iterator(dh::TypedDiscard<IndexFlagTuple>(), write_results);
  auto counting = thrust::make_counting_iterator(0llu);
  auto input_iterator = dh::MakeTransformIterator<IndexFlagTuple>(
      counting, [=] __device__(size_t idx) {