[EM] Compress dense ellpack. (#10821)

This helps reduce the memory copying needed for dense data. In addition, it helps reduce memory usage even if external memory is not used.

- Decouple the number of symbols needed in the compressor from the number of features when the data is dense.
- Remove the fetch call in the `at_end_` iteration.
- Reduce synchronization and kernel launches by using the `uvector` and ctx.
This commit is contained in:
Jiaming Yuan
2024-09-20 18:20:56 +08:00
committed by GitHub
parent d5e1c41b69
commit 24241ed6e3
28 changed files with 485 additions and 285 deletions

View File

@@ -1,9 +1,9 @@
// Copyright (c) 2019 by Contributors
/**
* Copyright 2019-2024, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/data.h>
#include "../../../src/data/adapter.h"
#include "../../../src/data/simple_dmatrix.h"
#include "../../../src/common/timer.h"
#include "../helpers.h"
#include <thrust/device_vector.h>
#include "../../../src/data/device_adapter.cuh"
@@ -64,7 +64,7 @@ TEST(DeviceAdapter, GetRowCounts) {
auto adapter = CupyAdapter{str_arr};
HostDeviceVector<bst_idx_t> offset(adapter.NumRows() + 1, 0);
offset.SetDevice(ctx.Device());
auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.Device(),
auto rstride = GetRowCounts(&ctx, adapter.Value(), offset.DeviceSpan(), ctx.Device(),
std::numeric_limits<float>::quiet_NaN());
ASSERT_EQ(rstride, n_features);
}

View File

@@ -30,13 +30,13 @@ TEST(EllpackPage, EmptyDMatrix) {
}
TEST(EllpackPage, BuildGidxDense) {
int constexpr kNRows = 16, kNCols = 8;
bst_idx_t n_samples = 16, n_features = 8;
auto ctx = MakeCUDACtx(0);
auto page = BuildEllpackPage(&ctx, kNRows, kNCols);
auto page = BuildEllpackPage(&ctx, n_samples, n_features);
std::vector<common::CompressedByteT> h_gidx_buffer;
auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer);
ASSERT_EQ(page->row_stride, kNCols);
ASSERT_EQ(page->row_stride, n_features);
std::vector<uint32_t> solution = {
0, 3, 8, 9, 14, 17, 20, 21,
@@ -56,8 +56,9 @@ TEST(EllpackPage, BuildGidxDense) {
2, 4, 8, 10, 14, 15, 19, 22,
1, 4, 7, 10, 14, 16, 19, 21,
};
for (size_t i = 0; i < kNRows * kNCols; ++i) {
ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]);
for (size_t i = 0; i < n_samples * n_features; ++i) {
auto fidx = i % n_features;
ASSERT_EQ(solution[i], h_accessor.gidx_iter[i] + h_accessor.feature_segments[fidx]);
}
}
@@ -263,12 +264,12 @@ class EllpackPageTest : public testing::TestWithParam<float> {
ASSERT_EQ(from_sparse_page->base_rowid, from_ghist->base_rowid);
ASSERT_EQ(from_sparse_page->n_rows, from_ghist->n_rows);
ASSERT_EQ(from_sparse_page->gidx_buffer.size(), from_ghist->gidx_buffer.size());
ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols());
std::vector<common::CompressedByteT> h_gidx_from_sparse, h_gidx_from_ghist;
auto from_ghist_acc = from_ghist->GetHostAccessor(&gpu_ctx, &h_gidx_from_ghist);
auto from_sparse_acc = from_sparse_page->GetHostAccessor(&gpu_ctx, &h_gidx_from_sparse);
ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols());
for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) {
EXPECT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]);
ASSERT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]);
}
}
}

View File

@@ -106,9 +106,11 @@ TEST(IterativeDeviceDMatrix, RowMajor) {
common::Span<float const> s_data{static_cast<float const*>(loaded.data), cols * rows};
dh::CopyDeviceSpanToVector(&h_data, s_data);
for(auto i = 0ull; i < rows * cols; i++) {
auto cut_ptr = h_accessor.feature_segments;
for (auto i = 0ull; i < rows * cols; i++) {
int column_idx = i % cols;
EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), h_accessor.gidx_iter[i]);
EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx),
h_accessor.gidx_iter[i] + cut_ptr[column_idx]);
}
EXPECT_EQ(m.Info().num_col_, cols);
EXPECT_EQ(m.Info().num_row_, rows);

View File

@@ -12,6 +12,7 @@
#include "../../../src/data/file_iterator.h"
#include "../../../src/data/simple_dmatrix.h"
#include "../../../src/data/sparse_page_dmatrix.h"
#include "../../../src/tree/param.h" // for TrainParam
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
@@ -115,6 +116,47 @@ TEST(SparsePageDMatrix, RetainSparsePage) {
TestRetainPage<SortedCSCPage>();
}
class TestGradientIndexExt : public ::testing::TestWithParam<bool> {
protected:
void Run(bool is_dense) {
constexpr bst_idx_t kRows = 64;
constexpr size_t kCols = 2;
float sparsity = is_dense ? 0.0 : 0.4;
bst_bin_t n_bins = 16;
Context ctx;
auto p_ext_fmat =
RandomDataGenerator{kRows, kCols, sparsity}.Batches(4).GenerateSparsePageDMatrix("temp",
true);
auto cuts = common::SketchOnDMatrix(&ctx, p_ext_fmat.get(), n_bins, false, {});
std::vector<std::unique_ptr<GHistIndexMatrix>> pages;
for (auto const &page : p_ext_fmat->GetBatches<SparsePage>()) {
pages.emplace_back(std::make_unique<GHistIndexMatrix>(
page, common::Span<FeatureType const>{}, cuts, n_bins, is_dense, 0.8, ctx.Threads()));
}
std::int32_t k = 0;
for (auto const &page : p_ext_fmat->GetBatches<GHistIndexMatrix>(
&ctx, BatchParam{n_bins, tree::TrainParam::DftSparseThreshold()})) {
auto const &from_sparse = pages[k];
ASSERT_TRUE(std::equal(page.index.begin(), page.index.end(), from_sparse->index.begin()));
if (is_dense) {
ASSERT_TRUE(std::equal(page.index.Offset(), page.index.Offset() + kCols,
from_sparse->index.Offset()));
} else {
ASSERT_FALSE(page.index.Offset());
ASSERT_FALSE(from_sparse->index.Offset());
}
ASSERT_TRUE(
std::equal(page.row_ptr.cbegin(), page.row_ptr.cend(), from_sparse->row_ptr.cbegin()));
++k;
}
}
};
TEST_P(TestGradientIndexExt, Basic) { this->Run(this->GetParam()); }
INSTANTIATE_TEST_SUITE_P(SparsePageDMatrix, TestGradientIndexExt, testing::Bool());
// Test GHistIndexMatrix can avoid loading sparse page after the initialization.
TEST(SparsePageDMatrix, GHistIndexSkipSparsePage) {
dmlc::TemporaryDirectory tmpdir;

View File

@@ -40,10 +40,9 @@ TEST(SparsePageDMatrix, EllpackPage) {
TEST(SparsePageDMatrix, EllpackSkipSparsePage) {
// Test Ellpack can avoid loading sparse page after the initialization.
dmlc::TemporaryDirectory tmpdir;
std::size_t n_batches = 6;
auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(n_batches).GenerateSparsePageDMatrix(
tmpdir.path + "/", true);
auto Xy =
RandomDataGenerator{180, 12, 0.0}.Batches(n_batches).GenerateSparsePageDMatrix("temp", true);
auto ctx = MakeCUDACtx(0);
auto cpu = ctx.MakeCPU();
bst_bin_t n_bins{256};
@@ -117,7 +116,6 @@ TEST(SparsePageDMatrix, EllpackSkipSparsePage) {
TEST(SparsePageDMatrix, MultipleEllpackPages) {
auto ctx = MakeCUDACtx(0);
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
dmlc::TemporaryDirectory tmpdir;
auto dmat = RandomDataGenerator{1024, 2, 0.5f}.Batches(2).GenerateSparsePageDMatrix("temp", true);
// Loop over the batches and count the records
@@ -155,18 +153,24 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
auto const& d_src = (*it).Impl()->gidx_buffer;
dh::safe_cuda(cudaMemcpyAsync(d_dst, d_src.data(), d_src.size_bytes(), cudaMemcpyDefault));
}
ASSERT_GE(iterators.size(), 2);
ASSERT_EQ(iterators.size(), 8);
for (size_t i = 0; i < iterators.size(); ++i) {
std::vector<common::CompressedByteT> h_buf;
[[maybe_unused]] auto h_acc = (*iterators[i]).Impl()->GetHostAccessor(&ctx, &h_buf);
ASSERT_EQ(h_buf, gidx_buffers.at(i).HostVector());
ASSERT_EQ(iterators[i].use_count(), 1);
// The last page is still kept in the DMatrix until Reset is called.
if (i == iterators.size() - 1) {
ASSERT_EQ(iterators[i].use_count(), 2);
} else {
ASSERT_EQ(iterators[i].use_count(), 1);
}
}
// make sure it's const and the caller can not modify the content of page.
for (auto& page : m->GetBatches<EllpackPage>(&ctx, param)) {
static_assert(std::is_const_v<std::remove_reference_t<decltype(page)>>);
break;
}
// The above iteration clears out all references inside DMatrix.
@@ -190,13 +194,10 @@ class TestEllpackPageExt : public ::testing::TestWithParam<std::tuple<bool, bool
auto p_fmat = RandomDataGenerator{kRows, kCols, sparsity}.GenerateDMatrix(true);
// Create a DMatrix with multiple batches.
dmlc::TemporaryDirectory tmpdir;
auto prefix = tmpdir.path + "/cache";
auto p_ext_fmat = RandomDataGenerator{kRows, kCols, sparsity}
.Batches(4)
.OnHost(on_host)
.GenerateSparsePageDMatrix(prefix, true);
.GenerateSparsePageDMatrix("temp", true);
auto param = BatchParam{2, tree::TrainParam::DftSparseThreshold()};
auto impl = (*p_fmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();