[EM] Compress dense ellpack. (#10821)
This helps reduce the memory copying needed for dense data. In addition, it helps reduce memory usage even if external memory is not used. - Decouple the number of symbols needed in the compressor from the number of features when the data is dense. - Remove the fetch call in the `at_end_` iteration. - Reduce synchronization and kernel launches by using the `uvector` and ctx.
This commit is contained in:
@@ -1,9 +1,9 @@
|
||||
// Copyright (c) 2019 by Contributors
|
||||
/**
|
||||
* Copyright 2019-2024, XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h>
|
||||
#include "../../../src/data/adapter.h"
|
||||
#include "../../../src/data/simple_dmatrix.h"
|
||||
#include "../../../src/common/timer.h"
|
||||
#include "../helpers.h"
|
||||
#include <thrust/device_vector.h>
|
||||
#include "../../../src/data/device_adapter.cuh"
|
||||
@@ -64,7 +64,7 @@ TEST(DeviceAdapter, GetRowCounts) {
|
||||
auto adapter = CupyAdapter{str_arr};
|
||||
HostDeviceVector<bst_idx_t> offset(adapter.NumRows() + 1, 0);
|
||||
offset.SetDevice(ctx.Device());
|
||||
auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.Device(),
|
||||
auto rstride = GetRowCounts(&ctx, adapter.Value(), offset.DeviceSpan(), ctx.Device(),
|
||||
std::numeric_limits<float>::quiet_NaN());
|
||||
ASSERT_EQ(rstride, n_features);
|
||||
}
|
||||
|
||||
@@ -30,13 +30,13 @@ TEST(EllpackPage, EmptyDMatrix) {
|
||||
}
|
||||
|
||||
TEST(EllpackPage, BuildGidxDense) {
|
||||
int constexpr kNRows = 16, kNCols = 8;
|
||||
bst_idx_t n_samples = 16, n_features = 8;
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
auto page = BuildEllpackPage(&ctx, kNRows, kNCols);
|
||||
auto page = BuildEllpackPage(&ctx, n_samples, n_features);
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer;
|
||||
auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer);
|
||||
|
||||
ASSERT_EQ(page->row_stride, kNCols);
|
||||
ASSERT_EQ(page->row_stride, n_features);
|
||||
|
||||
std::vector<uint32_t> solution = {
|
||||
0, 3, 8, 9, 14, 17, 20, 21,
|
||||
@@ -56,8 +56,9 @@ TEST(EllpackPage, BuildGidxDense) {
|
||||
2, 4, 8, 10, 14, 15, 19, 22,
|
||||
1, 4, 7, 10, 14, 16, 19, 21,
|
||||
};
|
||||
for (size_t i = 0; i < kNRows * kNCols; ++i) {
|
||||
ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]);
|
||||
for (size_t i = 0; i < n_samples * n_features; ++i) {
|
||||
auto fidx = i % n_features;
|
||||
ASSERT_EQ(solution[i], h_accessor.gidx_iter[i] + h_accessor.feature_segments[fidx]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -263,12 +264,12 @@ class EllpackPageTest : public testing::TestWithParam<float> {
|
||||
ASSERT_EQ(from_sparse_page->base_rowid, from_ghist->base_rowid);
|
||||
ASSERT_EQ(from_sparse_page->n_rows, from_ghist->n_rows);
|
||||
ASSERT_EQ(from_sparse_page->gidx_buffer.size(), from_ghist->gidx_buffer.size());
|
||||
ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols());
|
||||
std::vector<common::CompressedByteT> h_gidx_from_sparse, h_gidx_from_ghist;
|
||||
auto from_ghist_acc = from_ghist->GetHostAccessor(&gpu_ctx, &h_gidx_from_ghist);
|
||||
auto from_sparse_acc = from_sparse_page->GetHostAccessor(&gpu_ctx, &h_gidx_from_sparse);
|
||||
ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols());
|
||||
for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) {
|
||||
EXPECT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]);
|
||||
ASSERT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -106,9 +106,11 @@ TEST(IterativeDeviceDMatrix, RowMajor) {
|
||||
common::Span<float const> s_data{static_cast<float const*>(loaded.data), cols * rows};
|
||||
dh::CopyDeviceSpanToVector(&h_data, s_data);
|
||||
|
||||
for(auto i = 0ull; i < rows * cols; i++) {
|
||||
auto cut_ptr = h_accessor.feature_segments;
|
||||
for (auto i = 0ull; i < rows * cols; i++) {
|
||||
int column_idx = i % cols;
|
||||
EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), h_accessor.gidx_iter[i]);
|
||||
EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx),
|
||||
h_accessor.gidx_iter[i] + cut_ptr[column_idx]);
|
||||
}
|
||||
EXPECT_EQ(m.Info().num_col_, cols);
|
||||
EXPECT_EQ(m.Info().num_row_, rows);
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "../../../src/data/file_iterator.h"
|
||||
#include "../../../src/data/simple_dmatrix.h"
|
||||
#include "../../../src/data/sparse_page_dmatrix.h"
|
||||
#include "../../../src/tree/param.h" // for TrainParam
|
||||
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
||||
#include "../helpers.h"
|
||||
|
||||
@@ -115,6 +116,47 @@ TEST(SparsePageDMatrix, RetainSparsePage) {
|
||||
TestRetainPage<SortedCSCPage>();
|
||||
}
|
||||
|
||||
// Value-parameterized fixture (bool parameter) that compares GHistIndexMatrix pages
// obtained from GetBatches<GHistIndexMatrix>() on a sparse-page DMatrix against
// GHistIndexMatrix objects constructed directly from each SparsePage batch.
class TestGradientIndexExt : public ::testing::TestWithParam<bool> {
|
||||
protected:
|
||||
// Runs the comparison.
// is_dense: when true the data is generated with sparsity 0.0, otherwise 0.4; the
// flag is also forwarded to the GHistIndexMatrix constructor below.
void Run(bool is_dense) {
|
||||
constexpr bst_idx_t kRows = 64;
|
||||
constexpr size_t kCols = 2;
|
||||
float sparsity = is_dense ? 0.0 : 0.4;
|
||||
bst_bin_t n_bins = 16;
|
||||
Context ctx;
|
||||
// Generate the DMatrix under test, configured with Batches(4) and the "temp" prefix.
auto p_ext_fmat =
|
||||
RandomDataGenerator{kRows, kCols, sparsity}.Batches(4).GenerateSparsePageDMatrix("temp",
|
||||
true);
|
||||
|
|
||||
// Sketch the cuts once so that the reference pages below are built from the same cuts.
auto cuts = common::SketchOnDMatrix(&ctx, p_ext_fmat.get(), n_bins, false, {});
|
||||
// Reference pages: one GHistIndexMatrix per SparsePage batch, in iteration order.
std::vector<std::unique_ptr<GHistIndexMatrix>> pages;
|
||||
for (auto const &page : p_ext_fmat->GetBatches<SparsePage>()) {
|
||||
// NOTE(review): 0.8 is presumably the sparse threshold argument -- confirm against
// the GHistIndexMatrix constructor signature.
pages.emplace_back(std::make_unique<GHistIndexMatrix>(
|
||||
page, common::Span<FeatureType const>{}, cuts, n_bins, is_dense, 0.8, ctx.Threads()));
|
||||
}
|
||||
// k indexes the reference page that corresponds to the current batch.
// NOTE(review): pages[k] is not bounds-checked; this relies on both iterations
// yielding the same number of batches.
std::int32_t k = 0;
|
||||
for (auto const &page : p_ext_fmat->GetBatches<GHistIndexMatrix>(
|
||||
&ctx, BatchParam{n_bins, tree::TrainParam::DftSparseThreshold()})) {
|
||||
auto const &from_sparse = pages[k];
|
||||
// The bin index contents must match element by element.
ASSERT_TRUE(std::equal(page.index.begin(), page.index.end(), from_sparse->index.begin()));
|
||||
if (is_dense) {
|
||||
// Dense case: the first kCols entries of the index offsets must match.
ASSERT_TRUE(std::equal(page.index.Offset(), page.index.Offset() + kCols,
|
||||
from_sparse->index.Offset()));
|
||||
} else {
|
||||
// Non-dense case: Offset() is expected to evaluate to false for both pages.
ASSERT_FALSE(page.index.Offset());
|
||||
ASSERT_FALSE(from_sparse->index.Offset());
|
||||
}
|
||||
// Row pointers must match as well.
ASSERT_TRUE(
|
||||
std::equal(page.row_ptr.cbegin(), page.row_ptr.cend(), from_sparse->row_ptr.cbegin()));
|
||||
++k;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Forward the bool test parameter to TestGradientIndexExt::Run as `is_dense`.
TEST_P(TestGradientIndexExt, Basic) { this->Run(this->GetParam()); }
|
||||
|
||||
// Instantiate the suite over testing::Bool(), i.e. both false and true.
INSTANTIATE_TEST_SUITE_P(SparsePageDMatrix, TestGradientIndexExt, testing::Bool());
|
||||
|
||||
// Test GHistIndexMatrix can avoid loading sparse page after the initialization.
|
||||
TEST(SparsePageDMatrix, GHistIndexSkipSparsePage) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
|
||||
@@ -40,10 +40,9 @@ TEST(SparsePageDMatrix, EllpackPage) {
|
||||
|
||||
TEST(SparsePageDMatrix, EllpackSkipSparsePage) {
|
||||
// Test Ellpack can avoid loading sparse page after the initialization.
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::size_t n_batches = 6;
|
||||
auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(n_batches).GenerateSparsePageDMatrix(
|
||||
tmpdir.path + "/", true);
|
||||
auto Xy =
|
||||
RandomDataGenerator{180, 12, 0.0}.Batches(n_batches).GenerateSparsePageDMatrix("temp", true);
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
auto cpu = ctx.MakeCPU();
|
||||
bst_bin_t n_bins{256};
|
||||
@@ -117,7 +116,6 @@ TEST(SparsePageDMatrix, EllpackSkipSparsePage) {
|
||||
TEST(SparsePageDMatrix, MultipleEllpackPages) {
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
auto dmat = RandomDataGenerator{1024, 2, 0.5f}.Batches(2).GenerateSparsePageDMatrix("temp", true);
|
||||
|
||||
// Loop over the batches and count the records
|
||||
@@ -155,18 +153,24 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
|
||||
auto const& d_src = (*it).Impl()->gidx_buffer;
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_dst, d_src.data(), d_src.size_bytes(), cudaMemcpyDefault));
|
||||
}
|
||||
ASSERT_GE(iterators.size(), 2);
|
||||
ASSERT_EQ(iterators.size(), 8);
|
||||
|
||||
for (size_t i = 0; i < iterators.size(); ++i) {
|
||||
std::vector<common::CompressedByteT> h_buf;
|
||||
[[maybe_unused]] auto h_acc = (*iterators[i]).Impl()->GetHostAccessor(&ctx, &h_buf);
|
||||
ASSERT_EQ(h_buf, gidx_buffers.at(i).HostVector());
|
||||
ASSERT_EQ(iterators[i].use_count(), 1);
|
||||
// The last page is still kept in the DMatrix until Reset is called.
|
||||
if (i == iterators.size() - 1) {
|
||||
ASSERT_EQ(iterators[i].use_count(), 2);
|
||||
} else {
|
||||
ASSERT_EQ(iterators[i].use_count(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
// make sure it's const and the caller can not modify the content of page.
|
||||
for (auto& page : m->GetBatches<EllpackPage>(&ctx, param)) {
|
||||
static_assert(std::is_const_v<std::remove_reference_t<decltype(page)>>);
|
||||
break;
|
||||
}
|
||||
|
||||
// The above iteration clears out all references inside DMatrix.
|
||||
@@ -190,13 +194,10 @@ class TestEllpackPageExt : public ::testing::TestWithParam<std::tuple<bool, bool
|
||||
auto p_fmat = RandomDataGenerator{kRows, kCols, sparsity}.GenerateDMatrix(true);
|
||||
|
||||
// Create a DMatrix with multiple batches.
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
auto prefix = tmpdir.path + "/cache";
|
||||
|
||||
auto p_ext_fmat = RandomDataGenerator{kRows, kCols, sparsity}
|
||||
.Batches(4)
|
||||
.OnHost(on_host)
|
||||
.GenerateSparsePageDMatrix(prefix, true);
|
||||
.GenerateSparsePageDMatrix("temp", true);
|
||||
|
||||
auto param = BatchParam{2, tree::TrainParam::DftSparseThreshold()};
|
||||
auto impl = (*p_fmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
|
||||
|
||||
Reference in New Issue
Block a user