[EM] Compress dense ellpack. (#10821)

This reduces the memory copying needed for dense data. It also lowers memory usage even when external memory is not used.

- Decouple the number of symbols needed by the compressor from the number of features when the data is dense (see the sketch below).
- Remove the fetch call in the `at_end_` iteration.
- Reduce synchronization and kernel launches by using the `uvector` and the `ctx` (see the stream sketch after the diffs below).
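
The decoupling can be illustrated with a small standalone program. This is a minimal sketch of the idea only, not the XGBoost implementation; the cut pointers, `SymbolBits`, and the bit counts are made-up examples. With dense data every row stores exactly one entry per feature, so the feature is implied by the entry's position and each entry can be encoded as a bin index relative to its feature's first cut. The compressor's alphabet then shrinks from the total number of cuts across all features to the bin count of the widest feature.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Bits needed to encode `n_symbols` distinct values.
std::uint32_t SymbolBits(std::size_t n_symbols) {
  return static_cast<std::uint32_t>(
      std::ceil(std::log2(static_cast<double>(std::max<std::size_t>(n_symbols, 2)))));
}

int main() {
  // Hypothetical histogram cut pointers: feature f owns global bins
  // [cut_ptrs[f], cut_ptrs[f + 1]).
  std::vector<std::size_t> cut_ptrs{0, 256, 300, 556};

  // Global encoding: one symbol per bin across all features (+1 for "missing"),
  // so the alphabet grows with the number of features.
  std::size_t global_symbols = cut_ptrs.back() + 1;

  // Dense encoding: the feature index is implied by the entry's position in the
  // row, so a symbol only needs to distinguish bins within a single feature.
  std::size_t max_bins = 0;
  for (std::size_t f = 0; f + 1 < cut_ptrs.size(); ++f) {
    max_bins = std::max(max_bins, cut_ptrs[f + 1] - cut_ptrs[f]);
  }
  std::size_t dense_symbols = max_bins + 1;

  // Here: 557 symbols -> 10 bits per entry vs. 257 symbols -> 9 bits per entry.
  std::cout << "global: " << SymbolBits(global_symbols) << " bits, dense: "
            << SymbolBits(dense_symbols) << " bits\n";
  return 0;
}
```

As the number of features grows, the global alphabet keeps widening while the dense alphabet stays bounded by the largest per-feature bin count, which is where the memory saving comes from.
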
Author: Jiaming Yuan (committed by GitHub)
Date: 2024-09-20 18:20:56 +08:00
Parent: d5e1c41b69
Commit: 24241ed6e3
28 changed files with 485 additions and 285 deletions

@@ -73,13 +73,13 @@ TEST(Histogram, SubtractionTrack) {
   histogram.AllocateHistograms(&ctx, {0, 1, 2});
   GPUExpandEntry root;
   root.nid = 0;
-  auto need_build = histogram.SubtractHist({root}, {0}, {1});
+  auto need_build = histogram.SubtractHist(&ctx, {root}, {0}, {1});
   std::vector<GPUExpandEntry> candidates(2);
   candidates[0].nid = 1;
   candidates[1].nid = 2;
-  need_build = histogram.SubtractHist(candidates, {3, 5}, {4, 6});
+  need_build = histogram.SubtractHist(&ctx, candidates, {3, 5}, {4, 6});
   ASSERT_EQ(need_build.size(), 2);
   ASSERT_EQ(need_build[0], 4);
   ASSERT_EQ(need_build[1], 6);

@@ -33,9 +33,9 @@ void TestUpdatePositionBatch() {
   std::vector<int> extra_data = {0};
   // Send the first five training instances to the right node
   // and the second 5 to the left node
-  rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
-    return ridx > 4;
-  });
+  rp.UpdatePositionBatch(
+      &ctx, {0}, {1}, {2}, extra_data,
+      [=] __device__(RowPartitioner::RowIndexT ridx, int, int) { return ridx > 4; });
   rows = rp.GetRowsHost(1);
   for (auto r : rows) {
     EXPECT_GT(r, 4);
@@ -46,9 +46,9 @@ void TestUpdatePositionBatch() {
   }
   // Split the left node again
-  rp.UpdatePositionBatch({1}, {3}, {4}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
-    return ridx < 7;
-  });
+  rp.UpdatePositionBatch(
+      &ctx, {1}, {3}, {4}, extra_data,
+      [=] __device__(RowPartitioner::RowIndexT ridx, int, int) { return ridx < 7; });
   EXPECT_EQ(rp.GetRows(3).size(), 2);
   EXPECT_EQ(rp.GetRows(4).size(), 3);
 }
@@ -56,6 +56,7 @@ void TestUpdatePositionBatch() {
 TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); }
 void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Segment>& segments) {
+  auto ctx = MakeCUDACtx(0);
   thrust::device_vector<cuda_impl::RowIndexT> ridx = ridx_in;
   thrust::device_vector<cuda_impl::RowIndexT> ridx_tmp(ridx_in.size());
   thrust::device_vector<cuda_impl::RowIndexT> counts(segments.size());
@@ -74,7 +75,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
       h_batch_info.size() * sizeof(PerNodeData<int>), cudaMemcpyDefault,
       nullptr));
   dh::DeviceUVector<int8_t> tmp;
-  SortPositionBatch<decltype(op), int>(dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
+  SortPositionBatch<decltype(op), int>(&ctx, dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
       dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op,
       &tmp);
@@ -145,7 +146,7 @@ void TestExternalMemory() {
   std::vector<RegTree::Node> splits{tree[0]};
   auto acc = page.Impl()->GetDeviceAccessor(&ctx);
   partitioners.back()->UpdatePositionBatch(
-      {0}, {1}, {2}, splits,
+      &ctx, {0}, {1}, {2}, splits,
       [=] __device__(bst_idx_t ridx, std::int32_t nidx_in_batch, RegTree::Node const& node) {
         auto fvalue = acc.GetFvalue(ridx, node.SplitIndex());
         return fvalue <= node.SplitCond();
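
Every hunk above threads a `Context` pointer (`&ctx`) into the GPU helpers. Below is a speculative, minimal sketch of that pattern, assuming only that the context carries a CUDA stream; `MiniContext` and `SortOnStream` are illustrative names, not XGBoost's API. Running kernels and Thrust calls on the context's stream keeps helpers off the legacy default stream, avoids implicit device-wide synchronization, and lets the caller issue a single explicit sync where the result is actually needed.

```cpp
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>

#include <cuda_runtime.h>

#include <vector>

struct MiniContext {
  cudaStream_t stream{nullptr};  // stand-in for a stream owned by the context
};

void SortOnStream(MiniContext const* ctx, thrust::device_vector<int>* values) {
  // The execution policy pins the Thrust kernels to the context's stream;
  // nothing in this helper synchronizes the whole device.
  thrust::sort(thrust::cuda::par.on(ctx->stream), values->begin(), values->end());
}

int main() {
  MiniContext ctx;
  cudaStreamCreate(&ctx.stream);

  std::vector<int> h_values{5, 3, 1, 4, 2};
  thrust::device_vector<int> values(h_values.begin(), h_values.end());

  SortOnStream(&ctx, &values);        // helper takes &ctx, like the calls above
  cudaStreamSynchronize(ctx.stream);  // one explicit sync instead of many implicit ones
  cudaStreamDestroy(ctx.stream);
  return 0;
}
```
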