Deterministic data partitioning for external memory (#6317)

* Make external memory data partitioning deterministic.

* Change the meaning of `page_size` from bytes to number of rows.

* Design a data pool.

* Note for external memory.

* Enable unity build on Windows CI.

* Force garbage collect on test.
This commit is contained in:
Jiaming Yuan
2020-11-11 06:11:06 +08:00
committed by GitHub
parent 9564886d9f
commit 43efadea2e
15 changed files with 334 additions and 88 deletions

View File

@@ -130,8 +130,10 @@ TEST(DenseColumnWithMissing, Test) {
void TestGHistIndexMatrixCreation(size_t nthreads) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
/* This should create multiple sparse pages */
std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(1024, 1024, filename) };
std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(kEntries, kPageSize, filename) };
omp_set_num_threads(nthreads);
GHistIndexMatrix gmat;
gmat.Init(dmat.get(), 256);

View File

@@ -44,13 +44,13 @@ TEST(SparsePage, PushCSC) {
}
auto inst = page[0];
ASSERT_EQ(inst.size(), 2);
ASSERT_EQ(inst.size(), 2ul);
for (auto entry : inst) {
ASSERT_EQ(entry.index, 0);
ASSERT_EQ(entry.index, 0u);
}
inst = page[1];
ASSERT_EQ(inst.size(), 6);
ASSERT_EQ(inst.size(), 6ul);
std::vector<size_t> indices_sol {1, 2, 3};
for (size_t i = 0; i < inst.size(); ++i) {
ASSERT_EQ(inst[i].index, indices_sol[i % 3]);
@@ -58,15 +58,12 @@ TEST(SparsePage, PushCSC) {
}
TEST(SparsePage, PushCSCAfterTranspose) {
#if defined(__APPLE__)
LOG(WARNING) << "FIXME(trivialfis): Skipping `PushCSCAfterTranspose' for APPLE.";
return;
#endif
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
const int n_entries = 9;
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
std::unique_ptr<DMatrix> dmat =
CreateSparsePageDMatrix(n_entries, 64UL, filename);
CreateSparsePageDMatrix(kEntries, 64UL, filename);
const int ncols = dmat->Info().num_col_;
SparsePage page; // Consolidated sparse page
for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
@@ -76,7 +73,7 @@ TEST(SparsePage, PushCSCAfterTranspose) {
}
// Make sure that the final sparse page has the right number of entries
ASSERT_EQ(n_entries, page.data.Size());
ASSERT_EQ(kEntries, page.data.Size());
// The feature value for a feature in each row should be identical, as that is
// how the dmatrix has been created

View File

@@ -2,6 +2,7 @@
#include <dmlc/filesystem.h>
#include <gtest/gtest.h>
#include <xgboost/data.h>
#include "../../../src/common/io.h"
#include "../../../src/data/adapter.h"
#include "../../../src/data/sparse_page_dmatrix.h"
#include "../helpers.h"
@@ -11,16 +12,18 @@ using namespace xgboost; // NOLINT
TEST(SparsePageDMatrix, MetaInfo) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
CreateSimpleTestData(tmp_file);
size_t constexpr kEntries = 24;
CreateBigTestData(tmp_file, kEntries);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(
tmp_file + "#" + tmp_file + ".cache", false, false);
std::cout << tmp_file << std::endl;
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
// Test the metadata that was parsed
EXPECT_EQ(dmat->Info().num_row_, 2);
EXPECT_EQ(dmat->Info().num_col_, 5);
EXPECT_EQ(dmat->Info().num_nonzero_, 6);
EXPECT_EQ(dmat->Info().num_row_, 8ul);
EXPECT_EQ(dmat->Info().num_col_, 5ul);
EXPECT_EQ(dmat->Info().num_nonzero_, kEntries);
EXPECT_EQ(dmat->Info().labels_.Size(), dmat->Info().num_row_);
delete dmat;
@@ -30,13 +33,13 @@ TEST(SparsePageDMatrix, RowAccess) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
std::unique_ptr<xgboost::DMatrix> dmat =
xgboost::CreateSparsePageDMatrix(12, 64, filename);
xgboost::CreateSparsePageDMatrix(24, 4, filename);
// Test the data read into the first row
auto &batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
auto first_row = batch[0];
ASSERT_EQ(first_row.size(), 3);
EXPECT_EQ(first_row[2].index, 2);
ASSERT_EQ(first_row.size(), 3ul);
EXPECT_EQ(first_row[2].index, 2u);
EXPECT_EQ(first_row[2].fvalue, 20);
}
@@ -77,43 +80,56 @@ TEST(SparsePageDMatrix, ColAccess) {
TEST(SparsePageDMatrix, ExistingCacheFile) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
std::unique_ptr<xgboost::DMatrix> dmat =
xgboost::CreateSparsePageDMatrix(12, 64, filename);
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
EXPECT_ANY_THROW({
std::unique_ptr<xgboost::DMatrix> dmat2 =
xgboost::CreateSparsePageDMatrix(12, 64, filename);
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
});
}
#if defined(_OPENMP)
TEST(SparsePageDMatrix, ThreadSafetyException) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/test";
std::unique_ptr<xgboost::DMatrix> dmat =
xgboost::CreateSparsePageDMatrix(12, 64, filename);
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
bool exception = false;
std::unique_ptr<xgboost::DMatrix> dmat =
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
std::atomic<bool> exception {false};
int threads = 1000;
#pragma omp parallel for num_threads(threads)
for (auto i = 0; i < threads; i++) {
try {
auto iter = dmat->GetBatches<SparsePage>().begin();
++iter;
} catch (...) {
exception = true;
}
std::vector<std::thread> waiting;
for (int32_t i = 0; i < threads; ++i) {
waiting.emplace_back([&]() {
try {
auto iter = dmat->GetBatches<SparsePage>().begin();
++iter;
} catch (...) {
exception = true;
}
});
}
for (auto& t : waiting) {
t.join();
}
EXPECT_TRUE(exception);
}
#endif
// Multi-batches access
TEST(SparsePageDMatrix, ColAccessBatches) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
// Create multiple sparse pages
std::unique_ptr<xgboost::DMatrix> dmat{
xgboost::CreateSparsePageDMatrix(1024, 1024, filename)};
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename)};
auto n_threads = omp_get_max_threads();
omp_set_num_threads(16);
for (auto const &page : dmat->GetBatches<xgboost::CSCPage>()) {
@@ -286,3 +302,70 @@ TEST(SparsePageDMatrix, FromFile) {
}
}
}
TEST(SparsePageDMatrix, Large) {
std::string filename = "test.libsvm";
CreateBigTestData(filename, 1 << 16);
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
data::FileAdapter adapter(parser.get());
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix(
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 16)};
std::unique_ptr<DMatrix> simple{DMatrix::Load(filename, true, true)};
std::vector<float> sparse_data;
std::vector<size_t> sparse_rptr;
std::vector<bst_feature_t> sparse_cids;
DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids);
std::vector<float> simple_data;
std::vector<size_t> simple_rptr;
std::vector<bst_feature_t> simple_cids;
DMatrixToCSR(simple.get(), &simple_data, &simple_rptr, &simple_cids);
ASSERT_EQ(sparse_rptr.size(), sparse->Info().num_row_ + 1);
ASSERT_EQ(sparse_rptr.size(), simple->Info().num_row_ + 1);
ASSERT_EQ(sparse_data.size(), simple_data.size());
ASSERT_EQ(sparse_data, simple_data);
ASSERT_EQ(sparse_rptr.size(), simple_rptr.size());
ASSERT_EQ(sparse_rptr, simple_rptr);
ASSERT_EQ(sparse_cids, simple_cids);
}
auto TestSparsePageDMatrixDeterminism(int32_t threads, std::string const& filename) {
omp_set_num_threads(threads);
std::vector<float> sparse_data;
std::vector<size_t> sparse_rptr;
std::vector<bst_feature_t> sparse_cids;
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
data::FileAdapter adapter(parser.get());
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix(
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 1 << 8)};
DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids);
std::string cache_name = tmp_file + ".row.page";
std::string cache = common::LoadSequentialFile(cache_name);
return cache;
}
TEST(SparsePageDMatrix, Determinism) {
std::string filename = "test.libsvm";
CreateBigTestData(filename, 1 << 16);
std::vector<std::string> caches;
for (size_t i = 1; i < 18; i += 2) {
caches.emplace_back(TestSparsePageDMatrixDeterminism(i, filename));
}
for (size_t i = 1; i < caches.size(); ++i) {
ASSERT_EQ(caches[i], caches.front());
}
}

View File

@@ -28,7 +28,9 @@ TEST(SparsePageDMatrix, EllpackPage) {
TEST(SparsePageDMatrix, MultipleEllpackPages) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename);
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename);
// Loop over the batches and count the records
int64_t batch_count = 0;

View File

@@ -373,12 +373,8 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
batch_count++;
row_count += batch.Size();
}
#if defined(_OPENMP)
EXPECT_GE(batch_count, 2);
EXPECT_EQ(row_count, dmat->Info().num_row_);
#else
#warning "External memory doesn't work with Non-OpenMP build "
#endif // defined(_OPENMP)
return dmat;
}
@@ -495,6 +491,36 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(
return gbm;
}
void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
std::vector<size_t> *p_row_ptr,
std::vector<bst_feature_t> *p_cids) {
auto &data = *p_data;
auto &row_ptr = *p_row_ptr;
auto &cids = *p_cids;
data.resize(dmat->Info().num_nonzero_);
cids.resize(data.size());
row_ptr.resize(dmat->Info().num_row_ + 1);
SparsePage page;
for (const auto &batch : dmat->GetBatches<SparsePage>()) {
page.Push(batch);
}
auto const& in_offset = page.offset.HostVector();
auto const& in_data = page.data.HostVector();
CHECK_EQ(in_offset.size(), row_ptr.size());
std::copy(in_offset.cbegin(), in_offset.cend(), row_ptr.begin());
ASSERT_EQ(in_data.size(), data.size());
std::transform(in_data.cbegin(), in_data.cend(), data.begin(), [](Entry const& e) {
return e.fvalue;
});
ASSERT_EQ(in_data.size(), cids.size());
std::transform(in_data.cbegin(), in_data.cend(), cids.begin(), [](Entry const& e) {
return e.index;
});
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
using CUDAMemoryResource = rmm::mr::cuda_memory_resource;

View File

@@ -365,6 +365,10 @@ class CudaArrayIterForTest {
auto Proxy() -> decltype(proxy_) { return proxy_; }
};
void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
std::vector<size_t> *p_row_ptr,
std::vector<bst_feature_t> *p_cids);
typedef void *DataIterHandle; // NOLINT(*)
inline void Reset(DataIterHandle self) {

View File

@@ -16,8 +16,8 @@ TEST(CpuPredictor, Basic) {
std::unique_ptr<Predictor> cpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
int kRows = 5;
int kCols = 5;
size_t constexpr kRows = 5;
size_t constexpr kCols = 5;
LearnerModelParam param;
param.num_feature = kCols;
@@ -85,7 +85,11 @@ TEST(CpuPredictor, Basic) {
TEST(CpuPredictor, ExternalMemory) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename);
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename);
auto lparam = CreateEmptyGenericParam(GPUIDX);
std::unique_ptr<Predictor> cpu_predictor =

View File

@@ -105,9 +105,9 @@ TEST(GPUPredictor, ExternalMemoryTest) {
std::string file0 = tmpdir.path + "/big_0.libsvm";
std::string file1 = tmpdir.path + "/big_1.libsvm";
std::string file2 = tmpdir.path + "/big_2.libsvm";
dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0));
dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
dmats.push_back(CreateSparsePageDMatrix(400, 64UL, file0));
dmats.push_back(CreateSparsePageDMatrix(800, 128UL, file1));
dmats.push_back(CreateSparsePageDMatrix(8000, 1024UL, file2));
for (const auto& dmat: dmats) {
dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5);

View File

@@ -1,5 +1,6 @@
import numpy as np
import sys
import gc
import pytest
import xgboost as xgb
from hypothesis import given, strategies, assume, settings, note
@@ -118,7 +119,10 @@ class TestGPUUpdaters:
assume(len(dataset.y) > 0)
param['tree_method'] = 'gpu_hist'
param = dataset.set_params(param)
external_result = train_result(param, dataset.get_external_dmat(), num_rounds)
m = dataset.get_external_dmat()
external_result = train_result(param, m, num_rounds)
del m
gc.collect()
assert tm.non_increasing(external_result['train'][dataset.metric])
def test_empty_dmatrix_prediction(self):