[EM] Add GPU version of the external memory QDM. (#10689)
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
/**
|
||||
* Copyright 2024, XGBoost Contributors
|
||||
*/
|
||||
#include "test_extmem_quantile_dmatrix.h" // for TestExtMemQdmBasic
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h> // for BatchParam
|
||||
|
||||
@@ -9,76 +11,30 @@
|
||||
#include "../../../src/common/column_matrix.h" // for ColumnMatrix
|
||||
#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix
|
||||
#include "../../../src/tree/param.h" // for TrainParam
|
||||
#include "../helpers.h" // for RandomDataGenerator
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace {
|
||||
class ExtMemQuantileDMatrixCpu : public ::testing::TestWithParam<float> {
|
||||
public:
|
||||
void Run(float sparsity) {
|
||||
bst_idx_t n_samples = 256, n_features = 16, n_batches = 4;
|
||||
bst_bin_t max_bin = 64;
|
||||
bst_target_t n_targets = 3;
|
||||
auto p_fmat = RandomDataGenerator{n_samples, n_features, sparsity}
|
||||
.Bins(max_bin)
|
||||
.Batches(n_batches)
|
||||
.Targets(n_targets)
|
||||
.GenerateExtMemQuantileDMatrix("temp", true);
|
||||
ASSERT_FALSE(p_fmat->SingleColBlock());
|
||||
|
||||
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
|
||||
Context ctx;
|
||||
|
||||
// Loop over the batches and count the number of pages
|
||||
bst_idx_t batch_cnt = 0;
|
||||
bst_idx_t base_cnt = 0;
|
||||
bst_idx_t row_cnt = 0;
|
||||
for (auto const& page : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, p)) {
|
||||
ASSERT_EQ(page.base_rowid, base_cnt);
|
||||
++batch_cnt;
|
||||
base_cnt += n_samples / n_batches;
|
||||
row_cnt += page.Size();
|
||||
ASSERT_EQ((sparsity == 0.0f), page.IsDense());
|
||||
}
|
||||
ASSERT_EQ(n_batches, batch_cnt);
|
||||
ASSERT_EQ(p_fmat->Info().num_row_, n_samples);
|
||||
EXPECT_EQ(p_fmat->Info().num_row_, row_cnt);
|
||||
ASSERT_EQ(p_fmat->Info().num_col_, n_features);
|
||||
if (sparsity == 0.0f) {
|
||||
ASSERT_EQ(p_fmat->Info().num_nonzero_, n_samples * n_features);
|
||||
} else {
|
||||
ASSERT_LT(p_fmat->Info().num_nonzero_, n_samples * n_features);
|
||||
ASSERT_GT(p_fmat->Info().num_nonzero_, 0);
|
||||
}
|
||||
ASSERT_EQ(p_fmat->Info().labels.Shape(0), n_samples);
|
||||
ASSERT_EQ(p_fmat->Info().labels.Shape(1), n_targets);
|
||||
|
||||
// Compare against the sparse page DMatrix
|
||||
auto p_sparse = RandomDataGenerator{n_samples, n_features, sparsity}
|
||||
.Bins(max_bin)
|
||||
.Batches(n_batches)
|
||||
.Targets(n_targets)
|
||||
.GenerateSparsePageDMatrix("temp", true);
|
||||
auto it = p_fmat->GetBatches<GHistIndexMatrix>(&ctx, p).begin();
|
||||
for (auto const& page : p_sparse->GetBatches<GHistIndexMatrix>(&ctx, p)) {
|
||||
auto orig = it.Page();
|
||||
auto equal = [](Context const*, GHistIndexMatrix const& orig, GHistIndexMatrix const& sparse) {
|
||||
// Check the CSR matrix
|
||||
auto orig_cuts = it.Page()->Cuts();
|
||||
auto sparse_cuts = page.Cuts();
|
||||
auto orig_cuts = orig.Cuts();
|
||||
auto sparse_cuts = sparse.Cuts();
|
||||
ASSERT_EQ(orig_cuts.Values(), sparse_cuts.Values());
|
||||
ASSERT_EQ(orig_cuts.MinValues(), sparse_cuts.MinValues());
|
||||
ASSERT_EQ(orig_cuts.Ptrs(), sparse_cuts.Ptrs());
|
||||
|
||||
auto orig_ptr = orig->data.data();
|
||||
auto sparse_ptr = page.data.data();
|
||||
ASSERT_EQ(orig->data.size(), page.data.size());
|
||||
auto orig_ptr = orig.data.data();
|
||||
auto sparse_ptr = sparse.data.data();
|
||||
ASSERT_EQ(orig.data.size(), sparse.data.size());
|
||||
|
||||
auto equal = std::equal(orig_ptr, orig_ptr + orig->data.size(), sparse_ptr);
|
||||
auto equal = std::equal(orig_ptr, orig_ptr + orig.data.size(), sparse_ptr);
|
||||
ASSERT_TRUE(equal);
|
||||
|
||||
// Check the column matrix
|
||||
common::ColumnMatrix const& orig_columns = orig->Transpose();
|
||||
common::ColumnMatrix const& sparse_columns = page.Transpose();
|
||||
common::ColumnMatrix const& orig_columns = orig.Transpose();
|
||||
common::ColumnMatrix const& sparse_columns = sparse.Transpose();
|
||||
|
||||
std::string str_orig, str_sparse;
|
||||
common::AlignedMemWriteStream fo_orig{&str_orig}, fo_sparse{&str_sparse};
|
||||
@@ -86,18 +42,10 @@ class ExtMemQuantileDMatrixCpu : public ::testing::TestWithParam<float> {
|
||||
auto n_bytes_sparse = sparse_columns.Write(&fo_sparse);
|
||||
ASSERT_EQ(n_bytes_orig, n_bytes_sparse);
|
||||
ASSERT_EQ(str_orig, str_sparse);
|
||||
};
|
||||
|
||||
++it;
|
||||
}
|
||||
|
||||
// Check meta info
|
||||
auto h_y_sparse = p_sparse->Info().labels.HostView();
|
||||
auto h_y = p_fmat->Info().labels.HostView();
|
||||
for (std::size_t i = 0, m = h_y_sparse.Shape(0); i < m; ++i) {
|
||||
for (std::size_t j = 0, n = h_y_sparse.Shape(1); j < n; ++j) {
|
||||
ASSERT_EQ(h_y(i, j), h_y_sparse(i, j));
|
||||
}
|
||||
}
|
||||
Context ctx;
|
||||
TestExtMemQdmBasic<GHistIndexMatrix>(&ctx, false, sparsity, equal);
|
||||
}
|
||||
};
|
||||
} // anonymous namespace
|
||||
|
||||
45
tests/cpp/data/test_extmem_quantile_dmatrix.cu
Normal file
45
tests/cpp/data/test_extmem_quantile_dmatrix.cu
Normal file
@@ -0,0 +1,45 @@
|
||||
/**
|
||||
* Copyright 2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h> // for BatchParam
|
||||
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../../../src/data/ellpack_page.cuh" // for EllpackPageImpl
|
||||
#include "../helpers.h" // for RandomDataGenerator
|
||||
#include "test_extmem_quantile_dmatrix.h" // for TestExtMemQdmBasic
|
||||
|
||||
namespace xgboost::data {
|
||||
/**
 * @brief Value-parameterised fixture that compares the ellpack pages produced by the external
 *        memory quantile DMatrix with the ones produced by the sparse page DMatrix.
 *
 * The test parameter is the sparsity handed to the shared test driver.
 */
class ExtMemQuantileDMatrixGpu : public ::testing::TestWithParam<float> {
 public:
  /**
   * @brief Run the shared basic check with a CUDA context, once with `on_host` set to true
   *        and then once with it set to false.
   *
   * @param sparsity Sparsity forwarded to TestExtMemQdmBasic.
   */
  void Run(float sparsity) {
    // Comparator handed to the shared driver. The ASSERT_* macros used here only return from
    // the lambda itself.
    auto check_pages = [](Context const* ctx, EllpackPage const& qdm_page,
                          EllpackPage const& sparse_page) {
      // The histogram cuts of both pages have to match.
      auto const& qdm_cuts = qdm_page.Cuts();
      auto const& ref_cuts = sparse_page.Cuts();
      ASSERT_EQ(qdm_cuts.Values(), ref_cuts.Values());
      ASSERT_EQ(qdm_cuts.MinValues(), ref_cuts.MinValues());
      ASSERT_EQ(qdm_cuts.Ptrs(), ref_cuts.Ptrs());

      // The returned accessors are not inspected; only the host buffers passed in are compared.
      // NOTE(review): presumably GetHostAccessor fills the buffer it is given - confirm.
      std::vector<common::CompressedByteT> qdm_bytes;
      std::vector<common::CompressedByteT> ref_bytes;
      auto qdm_acc = qdm_page.Impl()->GetHostAccessor(ctx, &qdm_bytes, {});
      auto ref_acc = sparse_page.Impl()->GetHostAccessor(ctx, &ref_bytes, {});
      ASSERT_EQ(qdm_bytes.size(), ref_bytes.size());

      bool const same_bytes = std::equal(qdm_bytes.cbegin(), qdm_bytes.cend(), ref_bytes.cbegin());
      ASSERT_TRUE(same_bytes);
    };

    auto ctx = MakeCUDACtx(0);
    // Same order as before: on_host == true first, then on_host == false.
    for (bool on_host : {true, false}) {
      TestExtMemQdmBasic<EllpackPage>(&ctx, on_host, sparsity, check_pages);
    }
  }
};
|
||||
|
||||
// Run the basic comparison for the sparsity supplied by the instantiation below.
TEST_P(ExtMemQuantileDMatrixGpu, Basic) {
  auto const sparsity = this->GetParam();
  this->Run(sparsity);
}

// Sparsities exercised by the suite, in this order.
INSTANTIATE_TEST_SUITE_P(ExtMemQuantileDMatrix, ExtMemQuantileDMatrixGpu,
                         ::testing::Values(0.0f, 0.2f, 0.4f, 0.8f));
|
||||
} // namespace xgboost::data
|
||||
73
tests/cpp/data/test_extmem_quantile_dmatrix.h
Normal file
73
tests/cpp/data/test_extmem_quantile_dmatrix.h
Normal file
@@ -0,0 +1,73 @@
|
||||
/**
|
||||
* Copyright 2024, XGBoost Contributors
|
||||
*/
|
||||
#include <xgboost/base.h>
|
||||
#include <xgboost/context.h>
|
||||
|
||||
#include "../../../src/tree/param.h" // for TrainParam
|
||||
#include "../helpers.h" // for RandomDataGenerator
|
||||
|
||||
namespace xgboost::data {
|
||||
template <typename Page, typename Equal>
|
||||
void TestExtMemQdmBasic(Context const* ctx, bool on_host, float sparsity, Equal&& check_equal) {
|
||||
bst_idx_t n_samples = 256, n_features = 16, n_batches = 4;
|
||||
bst_bin_t max_bin = 64;
|
||||
bst_target_t n_targets = 3;
|
||||
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
|
||||
|
||||
auto p_fmat = RandomDataGenerator{n_samples, n_features, sparsity}
|
||||
.Bins(max_bin)
|
||||
.Batches(n_batches)
|
||||
.Targets(n_targets)
|
||||
.Device(ctx->Device())
|
||||
.OnHost(on_host)
|
||||
.GenerateExtMemQuantileDMatrix("temp", true);
|
||||
ASSERT_FALSE(p_fmat->SingleColBlock());
|
||||
|
||||
// Loop over the batches and count the number of pages
|
||||
bst_idx_t batch_cnt = 0, base_cnt = 0, row_cnt = 0;
|
||||
for (auto const& page : p_fmat->GetBatches<Page>(ctx, p)) {
|
||||
ASSERT_EQ(page.BaseRowId(), base_cnt);
|
||||
++batch_cnt;
|
||||
base_cnt += n_samples / n_batches;
|
||||
row_cnt += page.Size();
|
||||
ASSERT_EQ((sparsity == 0.0f), page.IsDense());
|
||||
}
|
||||
ASSERT_EQ(n_batches, batch_cnt);
|
||||
ASSERT_EQ(p_fmat->Info().num_row_, n_samples);
|
||||
EXPECT_EQ(p_fmat->Info().num_row_, row_cnt);
|
||||
ASSERT_EQ(p_fmat->Info().num_col_, n_features);
|
||||
if (sparsity == 0.0f) {
|
||||
ASSERT_EQ(p_fmat->Info().num_nonzero_, n_samples * n_features);
|
||||
} else {
|
||||
ASSERT_LT(p_fmat->Info().num_nonzero_, n_samples * n_features);
|
||||
ASSERT_GT(p_fmat->Info().num_nonzero_, 0);
|
||||
}
|
||||
ASSERT_EQ(p_fmat->Info().labels.Shape(0), n_samples);
|
||||
ASSERT_EQ(p_fmat->Info().labels.Shape(1), n_targets);
|
||||
|
||||
// Compare against the sparse page DMatrix
|
||||
auto p_sparse = RandomDataGenerator{n_samples, n_features, sparsity}
|
||||
.Bins(max_bin)
|
||||
.Batches(n_batches)
|
||||
.Targets(n_targets)
|
||||
.Device(ctx->Device())
|
||||
.OnHost(on_host)
|
||||
.GenerateSparsePageDMatrix("temp", true);
|
||||
auto it = p_fmat->GetBatches<Page>(ctx, p).begin();
|
||||
for (auto const& page : p_sparse->GetBatches<Page>(ctx, p)) {
|
||||
auto orig = it.Page();
|
||||
check_equal(ctx, *orig, page);
|
||||
++it;
|
||||
}
|
||||
|
||||
// Check meta info
|
||||
auto h_y_sparse = p_sparse->Info().labels.HostView();
|
||||
auto h_y = p_fmat->Info().labels.HostView();
|
||||
for (std::size_t i = 0, m = h_y_sparse.Shape(0); i < m; ++i) {
|
||||
for (std::size_t j = 0, n = h_y_sparse.Shape(1); j < n; ++j) {
|
||||
ASSERT_EQ(h_y(i, j), h_y_sparse(i, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
@@ -483,12 +483,15 @@ void RandomDataGenerator::GenerateCSR(
|
||||
}
|
||||
CHECK(iter);
|
||||
|
||||
std::shared_ptr<DMatrix> p_fmat{
|
||||
DMatrix::Create(static_cast<DataIterHandle>(iter.get()), iter->Proxy(), nullptr, Reset, Next,
|
||||
std::numeric_limits<float>::quiet_NaN(), 0, this->bins_, prefix)};
|
||||
std::shared_ptr<DMatrix> p_fmat{DMatrix::Create(
|
||||
static_cast<DataIterHandle>(iter.get()), iter->Proxy(), nullptr, Reset, Next,
|
||||
std::numeric_limits<float>::quiet_NaN(), 0, this->bins_, prefix, this->on_host_)};
|
||||
|
||||
auto page_path = data::MakeId(prefix, p_fmat.get()) + ".gradient_index.page";
|
||||
EXPECT_TRUE(FileExists(page_path)) << page_path;
|
||||
auto page_path = data::MakeId(prefix, p_fmat.get());
|
||||
page_path += device_.IsCPU() ? ".gradient_index.page" : ".ellpack.page";
|
||||
if (!this->on_host_) {
|
||||
EXPECT_TRUE(FileExists(page_path)) << page_path;
|
||||
}
|
||||
|
||||
if (with_label) {
|
||||
RandomDataGenerator{static_cast<bst_idx_t>(p_fmat->Info().num_row_), this->n_targets_, 0.0f}
|
||||
|
||||
Reference in New Issue
Block a user