Remove unnecessary fetch operations in external memory. (#10342)
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost contributors
|
||||
* Copyright 2021-2024, XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/context.h> // for Context
|
||||
@@ -8,10 +8,10 @@
|
||||
#include <memory> // for unique_ptr
|
||||
|
||||
#include "../../../src/common/column_matrix.h"
|
||||
#include "../../../src/common/io.h" // for MmapResource, AlignedResourceReadStream...
|
||||
#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix
|
||||
#include "../../../src/data/sparse_page_source.h"
|
||||
#include "../helpers.h" // for RandomDataGenerator
|
||||
#include "../../../src/common/io.h" // for MmapResource, AlignedResourceReadStream...
|
||||
#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix
|
||||
#include "../../../src/data/sparse_page_writer.h" // for CreatePageFormat
|
||||
#include "../helpers.h" // for RandomDataGenerator
|
||||
|
||||
namespace xgboost::data {
|
||||
TEST(GHistIndexPageRawFormat, IO) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2016-2023 by XGBoost Contributors
|
||||
* Copyright 2016-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h>
|
||||
@@ -115,9 +115,67 @@ TEST(SparsePageDMatrix, RetainSparsePage) {
|
||||
TestRetainPage<SortedCSCPage>();
|
||||
}
|
||||
|
||||
// Test that GHistIndexMatrix can avoid loading sparse pages after its initialization.
|
||||
TEST(SparsePageDMatrix, GHistIndexSkipSparsePage) {
  // Build an external-memory DMatrix split into 6 batches, backed by a temporary directory.
  dmlc::TemporaryDirectory tmpdir;
  auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(6).GenerateSparsePageDMatrix(
      tmpdir.path + "/", true);
  Context ctx;
  bst_bin_t max_bin{256};
  double sparse_threshold{0.8};
  BatchParam batch_param{max_bin, sparse_threshold};

  // Walk every gradient-index page and verify that each one holds 30 rows and that
  // the pages are laid out contiguously (each base_rowid continues where the last ended).
  auto verify_ghist_pages = [&] {
    std::int32_t next_rowid = 0;
    for (auto const &ghist : Xy->GetBatches<GHistIndexMatrix>(&ctx, batch_param)) {
      ASSERT_EQ(ghist.Size(), 30);
      ASSERT_EQ(next_rowid, ghist.base_rowid);
      next_rowid += ghist.Size();
    }
  };
  verify_ghist_pages();

  auto ext_dmat = std::dynamic_pointer_cast<data::SparsePageDMatrix>(Xy);
  CHECK(ext_dmat);
  // Baseline: the sparse page fetch count observed right after the first pass.
  auto n_init_fetches = ext_dmat->SparsePageFetchCount();

  std::vector<float> hessian(Xy->Info().num_row_, 1.0f);
  // Repeat several rounds so that the fetch count is also checked after the pages
  // have been reset by requests for other page types/parameters.
  for (std::int32_t round = 0; round < 4; ++round) {
    auto fetches_before = ext_dmat->SparsePageFetchCount();
    verify_ghist_pages();
    // Iterating over the gradient index again must not fetch any sparse page.
    ASSERT_EQ(ext_dmat->SparsePageFetchCount(), fetches_before);
    if (round == 0) {
      ASSERT_EQ(fetches_before, n_init_fetches);
    }

    // Request the other page types in between, so that the DMatrix is exercised the
    // way it would be when reused for multiple purposes.
    for ([[maybe_unused]] auto const &unused : Xy->GetBatches<SparsePage>(&ctx)) {
    }
    for ([[maybe_unused]] auto const &unused : Xy->GetBatches<SortedCSCPage>(&ctx)) {
    }
    for ([[maybe_unused]] auto const &unused :
         Xy->GetBatches<GHistIndexMatrix>(&ctx, batch_param)) {
    }

    // Approx tree method pages: hessian-based parameters, first with the last
    // argument set to false, then to true.
    for (bool regenerate : {false, true}) {
      BatchParam approx_param{max_bin, common::Span{hessian.data(), hessian.size()}, regenerate};
      for ([[maybe_unused]] auto const &unused :
           Xy->GetBatches<GHistIndexMatrix>(&ctx, approx_param)) {
      }
    }

    // Pass the original batch parameter in again so the next round starts from it.
    verify_ghist_pages();
  }
}
|
||||
|
||||
TEST(SparsePageDMatrix, MetaInfo) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
const std::string tmp_file = tmpdir.path + "/simple.libsvm";
|
||||
size_t constexpr kEntries = 24;
|
||||
CreateBigTestData(tmp_file, kEntries);
|
||||
|
||||
|
||||
@@ -42,6 +42,36 @@ TEST(SparsePageDMatrix, EllpackPage) {
|
||||
delete dmat;
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, EllpackSkipSparsePage) {
  // Test that Ellpack can avoid loading sparse pages after its initialization.
  dmlc::TemporaryDirectory tmpdir;
  // External-memory DMatrix split into 6 batches, backed by the temporary directory.
  auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(6).GenerateSparsePageDMatrix(
      tmpdir.path + "/", true);
  auto ctx = MakeCUDACtx(0);
  bst_bin_t n_bins{256};
  double sparse_thresh{0.8};
  BatchParam batch_param{n_bins, sparse_thresh};

  // First pass: every page must hold 30 rows and the pages must be contiguous, i.e.
  // each base_rowid continues where the previous page ended.
  std::int32_t k = 0;
  for (auto const& page : Xy->GetBatches<EllpackPage>(&ctx, batch_param)) {
    auto impl = page.Impl();
    ASSERT_EQ(page.Size(), 30);
    ASSERT_EQ(k, impl->base_rowid);
    k += page.Size();
  }

  // Cast once; the same pointer is reused for every check below instead of being
  // re-cast (and shadowed) inside the loop.
  auto casted = std::dynamic_pointer_cast<data::SparsePageDMatrix>(Xy);
  CHECK(casted);
  // Make sure the number of fetches doesn't change (no new fetch).
  auto n_fetches = casted->SparsePageFetchCount();
  for (std::int32_t i = 0; i < 3; ++i) {
    for ([[maybe_unused]] auto const& page : Xy->GetBatches<EllpackPage>(&ctx, batch_param)) {
    }
    ASSERT_EQ(casted->SparsePageFetchCount(), n_fetches);
  }
}
|
||||
|
||||
TEST(SparsePageDMatrix, MultipleEllpackPages) {
|
||||
Context ctx{MakeCUDACtx(0)};
|
||||
auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
|
||||
|
||||
Reference in New Issue
Block a user