Refactor SparsePageSource, delete cache files after use (#5321)
* Refactor sparse page source
* Delete temporary cache files
* Log fatal if cache exists
* Log fatal if multiple threads used with prefetcher
This commit is contained in:
@@ -1,10 +1,10 @@
|
||||
// Copyright by Contributors
|
||||
#include <dmlc/filesystem.h>
|
||||
#include <xgboost/data.h>
|
||||
#include "../../../src/data/sparse_page_dmatrix.h"
|
||||
#include "../../../src/data/adapter.h"
|
||||
#include "../helpers.h"
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h>
|
||||
#include "../../../src/data/adapter.h"
|
||||
#include "../../../src/data/sparse_page_dmatrix.h"
|
||||
#include "../helpers.h"
|
||||
|
||||
using namespace xgboost; // NOLINT
|
||||
|
||||
@@ -12,8 +12,8 @@ TEST(SparsePageDMatrix, MetaInfo) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", false, false);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", false, false);
|
||||
std::cout << tmp_file << std::endl;
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
|
||||
|
||||
@@ -44,21 +44,21 @@ TEST(SparsePageDMatrix, ColAccess) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false);
|
||||
xgboost::DMatrix *dmat =
|
||||
xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false);
|
||||
|
||||
EXPECT_EQ(dmat->GetColDensity(0), 1);
|
||||
EXPECT_EQ(dmat->GetColDensity(1), 0.5);
|
||||
|
||||
// Loop over the batches and assert the data is as expected
|
||||
for (auto const& col_batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
|
||||
for (auto const &col_batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
|
||||
EXPECT_EQ(col_batch.Size(), dmat->Info().num_col_);
|
||||
EXPECT_EQ(col_batch[1][0].fvalue, 10.0f);
|
||||
EXPECT_EQ(col_batch[1].size(), 1);
|
||||
}
|
||||
|
||||
// Loop over the batches and assert the data is as expected
|
||||
for (auto const& col_batch : dmat->GetBatches<xgboost::CSCPage>()) {
|
||||
for (auto const &col_batch : dmat->GetBatches<xgboost::CSCPage>()) {
|
||||
EXPECT_EQ(col_batch.Size(), dmat->Info().num_col_);
|
||||
EXPECT_EQ(col_batch[1][0].fvalue, 10.0f);
|
||||
EXPECT_EQ(col_batch[1].size(), 1);
|
||||
@@ -70,25 +70,61 @@ TEST(SparsePageDMatrix, ColAccess) {
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.sorted.col.page"));
|
||||
|
||||
delete dmat;
|
||||
|
||||
EXPECT_FALSE(FileExists(tmp_file + ".cache"));
|
||||
EXPECT_FALSE(FileExists(tmp_file + ".cache.row.page"));
|
||||
EXPECT_FALSE(FileExists(tmp_file + ".cache.col.page"));
|
||||
EXPECT_FALSE(FileExists(tmp_file + ".cache.sorted.col.page"));
|
||||
}
|
||||
|
||||
// Verifies that building a second sparse-page DMatrix with the same cache
// prefix, while the first one is still alive, raises an error.
TEST(SparsePageDMatrix, ExistingCacheFile) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/big.libsvm";
|
||||
// First matrix built from `filename`; it stays alive for the rest of the test.
std::unique_ptr<xgboost::DMatrix> dmat =
|
||||
xgboost::CreateSparsePageDMatrix(12, 64, filename);
|
||||
// Same arguments, same `filename`: the construction is expected to throw.
// Presumably this is the "Log fatal if cache exists" check named in the
// commit message -- confirm against CreateSparsePageDMatrix / the page source.
EXPECT_ANY_THROW({
|
||||
std::unique_ptr<xgboost::DMatrix> dmat2 =
|
||||
xgboost::CreateSparsePageDMatrix(12, 64, filename);
|
||||
});
|
||||
}
|
||||
|
||||
// This test is compiled only when OpenMP is enabled.
#if defined(_OPENMP)
|
||||
// Verifies that advancing SparsePage batch iterators of one DMatrix from a
// parallel OpenMP loop produces at least one exception.
// Presumably this exercises the "Log fatal if multiple threads used with
// prefetcher" check named in the commit message -- confirm in the page source.
TEST(SparsePageDMatrix, ThreadSafetyException) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/test";
|
||||
std::unique_ptr<xgboost::DMatrix> dmat =
|
||||
xgboost::CreateSparsePageDMatrix(12, 64, filename);
|
||||
|
||||
// Set to true by any loop iteration that catches an exception.
// NOTE(review): this plain bool is written from inside the parallel loop
// without synchronization. Every writer stores `true`, but it is still
// formally a data race -- consider an atomic flag or an OpenMP reduction.
bool exception = false;
|
||||
// Number of loop iterations; how many threads actually run them is decided
// by the OpenMP runtime.
int threads = 1000;
|
||||
#pragma omp parallel for
|
||||
for (auto i = 0; i < threads; i++) {
|
||||
// Exceptions are caught inside the loop body so none leaves the parallel
// region.
try {
|
||||
auto iter = dmat->GetBatches<SparsePage>().begin();
|
||||
++iter;
|
||||
} catch (...) {
|
||||
exception = true;
|
||||
}
|
||||
}
|
||||
// At least one iteration must have thrown.
EXPECT_TRUE(exception);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Multi-batches access
|
||||
// Iterates the CSC (column) pages of a sparse-page DMatrix with the OpenMP
// thread count set to 16 and checks that every page has one entry per column.
// NOTE(review): this span comes from a rendered diff, so the pre-change and
// post-change spellings of the reformatted statements (the `dmat` declaration
// and the `for` header) both appear below.
TEST(SparsePageDMatrix, ColAccessBatches) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/big.libsvm";
|
||||
// Create multiple sparse pages
|
||||
std::unique_ptr<xgboost::DMatrix> dmat {
|
||||
xgboost::CreateSparsePageDMatrix(1024, 1024, filename)
|
||||
};
|
||||
std::unique_ptr<xgboost::DMatrix> dmat{
|
||||
xgboost::CreateSparsePageDMatrix(1024, 1024, filename)};
|
||||
// Remember the current maximum thread count so it can be restored at the end.
auto n_threads = omp_get_max_threads();
|
||||
omp_set_num_threads(16);
|
||||
for (auto const& page : dmat->GetBatches<xgboost::CSCPage>()) {
|
||||
for (auto const &page : dmat->GetBatches<xgboost::CSCPage>()) {
|
||||
// Each column page is expected to span all columns of the matrix.
ASSERT_EQ(dmat->Info().num_col_, page.Size());
|
||||
}
|
||||
// Restore the thread count saved above.
omp_set_num_threads(n_threads);
|
||||
}
|
||||
|
||||
|
||||
TEST(SparsePageDMatrix, Empty) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
@@ -96,34 +132,40 @@ TEST(SparsePageDMatrix, Empty) {
|
||||
std::vector<unsigned> feature_idx = {};
|
||||
std::vector<size_t> row_ptr = {};
|
||||
|
||||
data::CSRAdapter csr_adapter(row_ptr.data(), feature_idx.data(), data.data(), 0, 0, 0);
|
||||
data::SparsePageDMatrix dmat(&csr_adapter,
|
||||
std::numeric_limits<float>::quiet_NaN(), 1,tmp_file);
|
||||
EXPECT_EQ(dmat.Info().num_nonzero_, 0);
|
||||
EXPECT_EQ(dmat.Info().num_row_, 0);
|
||||
EXPECT_EQ(dmat.Info().num_col_, 0);
|
||||
for (auto &batch : dmat.GetBatches<SparsePage>()) {
|
||||
EXPECT_EQ(batch.Size(), 0);
|
||||
{
|
||||
data::CSRAdapter csr_adapter(row_ptr.data(), feature_idx.data(),
|
||||
data.data(), 0, 0, 0);
|
||||
data::SparsePageDMatrix dmat(
|
||||
&csr_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
|
||||
EXPECT_EQ(dmat.Info().num_nonzero_, 0);
|
||||
EXPECT_EQ(dmat.Info().num_row_, 0);
|
||||
EXPECT_EQ(dmat.Info().num_col_, 0);
|
||||
for (auto &batch : dmat.GetBatches<SparsePage>()) {
|
||||
EXPECT_EQ(batch.Size(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
data::DenseAdapter dense_adapter(nullptr, 0, 0);
|
||||
data::SparsePageDMatrix dmat2(&dense_adapter,
|
||||
std::numeric_limits<float>::quiet_NaN(), 1,tmp_file);
|
||||
EXPECT_EQ(dmat2.Info().num_nonzero_, 0);
|
||||
EXPECT_EQ(dmat2.Info().num_row_, 0);
|
||||
EXPECT_EQ(dmat2.Info().num_col_, 0);
|
||||
for (auto &batch : dmat2.GetBatches<SparsePage>()) {
|
||||
EXPECT_EQ(batch.Size(), 0);
|
||||
{
|
||||
data::DenseAdapter dense_adapter(nullptr, 0, 0);
|
||||
data::SparsePageDMatrix dmat2(
|
||||
&dense_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
|
||||
EXPECT_EQ(dmat2.Info().num_nonzero_, 0);
|
||||
EXPECT_EQ(dmat2.Info().num_row_, 0);
|
||||
EXPECT_EQ(dmat2.Info().num_col_, 0);
|
||||
for (auto &batch : dmat2.GetBatches<SparsePage>()) {
|
||||
EXPECT_EQ(batch.Size(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
data::CSCAdapter csc_adapter(nullptr, nullptr, nullptr, 0, 0);
|
||||
data::SparsePageDMatrix dmat3(&csc_adapter,
|
||||
std::numeric_limits<float>::quiet_NaN(), 1,tmp_file);
|
||||
EXPECT_EQ(dmat3.Info().num_nonzero_, 0);
|
||||
EXPECT_EQ(dmat3.Info().num_row_, 0);
|
||||
EXPECT_EQ(dmat3.Info().num_col_, 0);
|
||||
for (auto &batch : dmat3.GetBatches<SparsePage>()) {
|
||||
EXPECT_EQ(batch.Size(), 0);
|
||||
{
|
||||
data::CSCAdapter csc_adapter(nullptr, nullptr, nullptr, 0, 0);
|
||||
data::SparsePageDMatrix dmat3(
|
||||
&csc_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
|
||||
EXPECT_EQ(dmat3.Info().num_nonzero_, 0);
|
||||
EXPECT_EQ(dmat3.Info().num_row_, 0);
|
||||
EXPECT_EQ(dmat3.Info().num_col_, 0);
|
||||
for (auto &batch : dmat3.GetBatches<SparsePage>()) {
|
||||
EXPECT_EQ(batch.Size(), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -134,12 +176,14 @@ TEST(SparsePageDMatrix, MissingData) {
|
||||
std::vector<unsigned> feature_idx = {0, 1, 0};
|
||||
std::vector<size_t> row_ptr = {0, 2, 3};
|
||||
|
||||
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2, 3, 2);
|
||||
data::SparsePageDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), 1,tmp_file);
|
||||
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
|
||||
3, 2);
|
||||
data::SparsePageDMatrix dmat(
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
|
||||
EXPECT_EQ(dmat.Info().num_nonzero_, 2);
|
||||
|
||||
const std::string tmp_file2 = tempdir.path + "/simple2.libsvm";
|
||||
data::SparsePageDMatrix dmat2(&adapter, 1.0, 1,tmp_file2);
|
||||
data::SparsePageDMatrix dmat2(&adapter, 1.0, 1, tmp_file2);
|
||||
EXPECT_EQ(dmat2.Info().num_nonzero_, 1);
|
||||
}
|
||||
|
||||
@@ -150,8 +194,10 @@ TEST(SparsePageDMatrix, EmptyRow) {
|
||||
std::vector<unsigned> feature_idx = {0, 1};
|
||||
std::vector<size_t> row_ptr = {0, 2, 2};
|
||||
|
||||
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2, 2, 2);
|
||||
data::SparsePageDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), 1,tmp_file);
|
||||
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
|
||||
2, 2);
|
||||
data::SparsePageDMatrix dmat(
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
|
||||
EXPECT_EQ(dmat.Info().num_nonzero_, 2);
|
||||
EXPECT_EQ(dmat.Info().num_row_, 2);
|
||||
EXPECT_EQ(dmat.Info().num_col_, 2);
|
||||
@@ -173,9 +219,8 @@ TEST(SparsePageDMatrix, FromDense) {
|
||||
for (auto &batch : dmat.GetBatches<SparsePage>()) {
|
||||
for (auto i = 0ull; i < batch.Size(); i++) {
|
||||
auto inst = batch[i];
|
||||
for(auto j = 0ull; j < inst.size(); j++)
|
||||
{
|
||||
EXPECT_EQ(inst[j].fvalue, data[i*n+j]);
|
||||
for (auto j = 0ull; j < inst.size(); j++) {
|
||||
EXPECT_EQ(inst[j].fvalue, data[i * n + j]);
|
||||
EXPECT_EQ(inst[j].index, j);
|
||||
}
|
||||
}
|
||||
@@ -215,9 +260,9 @@ TEST(SparsePageDMatrix, FromCSC) {
|
||||
|
||||
TEST(SparsePageDMatrix, FromFile) {
|
||||
std::string filename = "test.libsvm";
|
||||
CreateBigTestData(filename,20);
|
||||
CreateBigTestData(filename, 20);
|
||||
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
|
||||
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
|
||||
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
|
||||
data::FileAdapter adapter(parser.get());
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
|
||||
Reference in New Issue
Block a user