[EM] Compress dense ellpack. (#10821)

This reduces the memory copying needed for dense data. It also lowers memory usage even when external memory is not used.

- Decouple the number of symbols needed by the compressor from the number of features when the data is dense (see the sketch below).
- Remove the fetch call in the `at_end_` iteration.
- Reduce synchronization and kernel launches by using the `uvector` and the `ctx` (see the stream sketch after the diffs below).
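
The decoupling can be illustrated with a small standalone program. This is a minimal sketch of the idea only, not the XGBoost implementation; the cut pointers, `SymbolBits`, and the bit counts are made-up examples. With dense data every row stores exactly one entry per feature, so the feature is implied by the entry's position and each entry can be encoded as a bin index relative to its feature's first cut. The compressor's alphabet then shrinks from the total number of cuts across all features to the bin count of the widest feature.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Bits needed to encode `n_symbols` distinct values.
std::uint32_t SymbolBits(std::size_t n_symbols) {
  return static_cast<std::uint32_t>(
      std::ceil(std::log2(static_cast<double>(std::max<std::size_t>(n_symbols, 2)))));
}

int main() {
  // Hypothetical histogram cut pointers: feature f owns global bins
  // [cut_ptrs[f], cut_ptrs[f + 1]).
  std::vector<std::size_t> cut_ptrs{0, 256, 300, 556};

  // Global encoding: one symbol per bin across all features (+1 for "missing"),
  // so the alphabet grows with the number of features.
  std::size_t global_symbols = cut_ptrs.back() + 1;

  // Dense encoding: the feature index is implied by the entry's position in the
  // row, so a symbol only needs to distinguish bins within a single feature.
  std::size_t max_bins = 0;
  for (std::size_t f = 0; f + 1 < cut_ptrs.size(); ++f) {
    max_bins = std::max(max_bins, cut_ptrs[f + 1] - cut_ptrs[f]);
  }
  std::size_t dense_symbols = max_bins + 1;

  // Here: 557 symbols -> 10 bits per entry vs. 257 symbols -> 9 bits per entry.
  std::cout << "global: " << SymbolBits(global_symbols) << " bits, dense: "
            << SymbolBits(dense_symbols) << " bits\n";
  return 0;
}
```

As the number of features grows, the global alphabet keeps widening while the dense alphabet stays bounded by the largest per-feature bin count, which is where the memory saving comes from.
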
Author: Jiaming Yuan (committed by GitHub)
Date: 2024-09-20 18:20:56 +08:00
Parent: d5e1c41b69
Commit: 24241ed6e3
28 changed files with 485 additions and 285 deletions

@@ -73,13 +73,13 @@ TEST(Histogram, SubtractionTrack) {
   histogram.AllocateHistograms(&ctx, {0, 1, 2});
   GPUExpandEntry root;
   root.nid = 0;
-  auto need_build = histogram.SubtractHist({root}, {0}, {1});
+  auto need_build = histogram.SubtractHist(&ctx, {root}, {0}, {1});
   std::vector<GPUExpandEntry> candidates(2);
   candidates[0].nid = 1;
   candidates[1].nid = 2;
-  need_build = histogram.SubtractHist(candidates, {3, 5}, {4, 6});
+  need_build = histogram.SubtractHist(&ctx, candidates, {3, 5}, {4, 6});
   ASSERT_EQ(need_build.size(), 2);
   ASSERT_EQ(need_build[0], 4);
   ASSERT_EQ(need_build[1], 6);

@@ -33,9 +33,9 @@ void TestUpdatePositionBatch() {
   std::vector<int> extra_data = {0};
   // Send the first five training instances to the right node
   // and the second 5 to the left node
-  rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
-    return ridx > 4;
-  });
+  rp.UpdatePositionBatch(
+      &ctx, {0}, {1}, {2}, extra_data,
+      [=] __device__(RowPartitioner::RowIndexT ridx, int, int) { return ridx > 4; });
   rows = rp.GetRowsHost(1);
   for (auto r : rows) {
     EXPECT_GT(r, 4);
@@ -46,9 +46,9 @@ void TestUpdatePositionBatch() {
   }
   // Split the left node again
-  rp.UpdatePositionBatch({1}, {3}, {4}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
-    return ridx < 7;
-  });
+  rp.UpdatePositionBatch(
+      &ctx, {1}, {3}, {4}, extra_data,
+      [=] __device__(RowPartitioner::RowIndexT ridx, int, int) { return ridx < 7; });
   EXPECT_EQ(rp.GetRows(3).size(), 2);
   EXPECT_EQ(rp.GetRows(4).size(), 3);
 }
@@ -56,6 +56,7 @@ void TestUpdatePositionBatch() {
 TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); }
 void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Segment>& segments) {
+  auto ctx = MakeCUDACtx(0);
   thrust::device_vector<cuda_impl::RowIndexT> ridx = ridx_in;
   thrust::device_vector<cuda_impl::RowIndexT> ridx_tmp(ridx_in.size());
   thrust::device_vector<cuda_impl::RowIndexT> counts(segments.size());
@@ -74,7 +75,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
       h_batch_info.size() * sizeof(PerNodeData<int>), cudaMemcpyDefault,
       nullptr));
   dh::DeviceUVector<int8_t> tmp;
-  SortPositionBatch<decltype(op), int>(dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
+  SortPositionBatch<decltype(op), int>(&ctx, dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
       dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op,
       &tmp);
@@ -145,7 +146,7 @@ void TestExternalMemory() {
   std::vector<RegTree::Node> splits{tree[0]};
   auto acc = page.Impl()->GetDeviceAccessor(&ctx);
   partitioners.back()->UpdatePositionBatch(
-      {0}, {1}, {2}, splits,
+      &ctx, {0}, {1}, {2}, splits,
       [=] __device__(bst_idx_t ridx, std::int32_t nidx_in_batch, RegTree::Node const& node) {
         auto fvalue = acc.GetFvalue(ridx, node.SplitIndex());
         return fvalue <= node.SplitCond();
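
Every hunk above threads a `Context` pointer (`&ctx`) into the GPU helpers. Below is a speculative, minimal sketch of that pattern, assuming only that the context carries a CUDA stream; `MiniContext` and `SortOnStream` are illustrative names, not XGBoost's API. Running kernels and Thrust calls on the context's stream keeps helpers off the legacy default stream, avoids implicit device-wide synchronization, and lets the caller issue a single explicit sync where the result is actually needed.

```cpp
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>

#include <cuda_runtime.h>

#include <vector>

struct MiniContext {
  cudaStream_t stream{nullptr};  // stand-in for a stream owned by the context
};

void SortOnStream(MiniContext const* ctx, thrust::device_vector<int>* values) {
  // The execution policy pins the Thrust kernels to the context's stream;
  // nothing in this helper synchronizes the whole device.
  thrust::sort(thrust::cuda::par.on(ctx->stream), values->begin(), values->end());
}

int main() {
  MiniContext ctx;
  cudaStreamCreate(&ctx.stream);

  std::vector<int> h_values{5, 3, 1, 4, 2};
  thrust::device_vector<int> values(h_values.begin(), h_values.end());

  SortOnStream(&ctx, &values);        // helper takes &ctx, like the calls above
  cudaStreamSynchronize(ctx.stream);  // one explicit sync instead of many implicit ones
  cudaStreamDestroy(ctx.stream);
  return 0;
}
```
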