[EM] Compress dense ellpack. (#10821)
This helps reduce the memory copying needed for dense data. In addition, it helps reduce memory usage even if external memory is not used.

- Decouple the number of symbols needed in the compressor from the number of features when the data is dense.
- Remove the fetch call in the `at_end_` iteration.
- Reduce synchronization and kernel launches by using the `uvector` and ctx.
This commit is contained in:
@@ -73,13 +73,13 @@ TEST(Histogram, SubtractionTrack) {
|
||||
histogram.AllocateHistograms(&ctx, {0, 1, 2});
|
||||
GPUExpandEntry root;
|
||||
root.nid = 0;
|
||||
auto need_build = histogram.SubtractHist({root}, {0}, {1});
|
||||
auto need_build = histogram.SubtractHist(&ctx, {root}, {0}, {1});
|
||||
|
||||
std::vector<GPUExpandEntry> candidates(2);
|
||||
candidates[0].nid = 1;
|
||||
candidates[1].nid = 2;
|
||||
|
||||
need_build = histogram.SubtractHist(candidates, {3, 5}, {4, 6});
|
||||
need_build = histogram.SubtractHist(&ctx, candidates, {3, 5}, {4, 6});
|
||||
ASSERT_EQ(need_build.size(), 2);
|
||||
ASSERT_EQ(need_build[0], 4);
|
||||
ASSERT_EQ(need_build[1], 6);
|
||||
|
||||
@@ -33,9 +33,9 @@ void TestUpdatePositionBatch() {
|
||||
std::vector<int> extra_data = {0};
|
||||
// Send the first five training instances to the right node
|
||||
// and the second 5 to the left node
|
||||
rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
|
||||
return ridx > 4;
|
||||
});
|
||||
rp.UpdatePositionBatch(
|
||||
&ctx, {0}, {1}, {2}, extra_data,
|
||||
[=] __device__(RowPartitioner::RowIndexT ridx, int, int) { return ridx > 4; });
|
||||
rows = rp.GetRowsHost(1);
|
||||
for (auto r : rows) {
|
||||
EXPECT_GT(r, 4);
|
||||
@@ -46,9 +46,9 @@ void TestUpdatePositionBatch() {
|
||||
}
|
||||
|
||||
// Split the left node again
|
||||
rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
|
||||
return ridx < 7;
|
||||
});
|
||||
rp.UpdatePositionBatch(
|
||||
&ctx, {1}, {3}, {4}, extra_data,
|
||||
[=] __device__(RowPartitioner::RowIndexT ridx, int, int) { return ridx < 7; });
|
||||
EXPECT_EQ(rp.GetRows(3).size(), 2);
|
||||
EXPECT_EQ(rp.GetRows(4).size(), 3);
|
||||
}
|
||||
@@ -56,6 +56,7 @@ void TestUpdatePositionBatch() {
|
||||
TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); }
|
||||
|
||||
void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Segment>& segments) {
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
thrust::device_vector<cuda_impl::RowIndexT> ridx = ridx_in;
|
||||
thrust::device_vector<cuda_impl::RowIndexT> ridx_tmp(ridx_in.size());
|
||||
thrust::device_vector<cuda_impl::RowIndexT> counts(segments.size());
|
||||
@@ -74,7 +75,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
|
||||
h_batch_info.size() * sizeof(PerNodeData<int>), cudaMemcpyDefault,
|
||||
nullptr));
|
||||
dh::DeviceUVector<int8_t> tmp;
|
||||
SortPositionBatch<decltype(op), int>(dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
|
||||
SortPositionBatch<decltype(op), int>(&ctx, dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
|
||||
dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op,
|
||||
&tmp);
|
||||
|
||||
@@ -145,7 +146,7 @@ void TestExternalMemory() {
|
||||
std::vector<RegTree::Node> splits{tree[0]};
|
||||
auto acc = page.Impl()->GetDeviceAccessor(&ctx);
|
||||
partitioners.back()->UpdatePositionBatch(
|
||||
{0}, {1}, {2}, splits,
|
||||
&ctx, {0}, {1}, {2}, splits,
|
||||
[=] __device__(bst_idx_t ridx, std::int32_t nidx_in_batch, RegTree::Node const& node) {
|
||||
auto fvalue = acc.GetFvalue(ridx, node.SplitIndex());
|
||||
return fvalue <= node.SplitCond();
|
||||
|
||||
Reference in New Issue
Block a user