Rewrite sparse dmatrix using callbacks. (#7092)
- Reduce dependency on dmlc parsers and provide an interface for users to load data by themselves.
- Remove use of threaded iterator and IO queue.
- Remove `page_size`.
- Make sure the number of pages in memory is bounded.
- Make sure the cache cannot be violated.
- Provide an interface for internal algorithms to process data asynchronously.
This commit is contained in:
@@ -125,12 +125,10 @@ TEST(DenseColumnWithMissing, Test) {
|
||||
}
|
||||
|
||||
void TestGHistIndexMatrixCreation(size_t nthreads) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/big.libsvm";
|
||||
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
|
||||
/* This should create multiple sparse pages */
|
||||
std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(kEntries, kPageSize, filename) };
|
||||
std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(kEntries) };
|
||||
omp_set_num_threads(nthreads);
|
||||
GHistIndexMatrix gmat(dmat.get(), 256);
|
||||
}
|
||||
|
||||
@@ -83,7 +83,7 @@ inline std::shared_ptr<DMatrix> GetExternalMemoryDMatrixFromData(
|
||||
}
|
||||
fo.close();
|
||||
return std::shared_ptr<DMatrix>(DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size));
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false, "auto"));
|
||||
}
|
||||
|
||||
// Test that elements are approximately equally distributed among bins
|
||||
|
||||
@@ -59,12 +59,9 @@ TEST(SparsePage, PushCSC) {
|
||||
}
|
||||
|
||||
TEST(SparsePage, PushCSCAfterTranspose) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/big.libsvm";
|
||||
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
|
||||
std::unique_ptr<DMatrix> dmat =
|
||||
CreateSparsePageDMatrix(kEntries, 64UL, filename);
|
||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
|
||||
const int ncols = dmat->Info().num_col_;
|
||||
SparsePage page; // Consolidated sparse page
|
||||
for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
|
||||
@@ -76,12 +73,12 @@ TEST(SparsePage, PushCSCAfterTranspose) {
|
||||
// Make sure that the final sparse page has the right number of entries
|
||||
ASSERT_EQ(kEntries, page.data.Size());
|
||||
|
||||
// The feature value for a feature in each row should be identical, as that is
|
||||
// how the dmatrix has been created
|
||||
for (size_t i = 0; i < page.Size(); ++i) {
|
||||
auto inst = page.GetView()[i];
|
||||
for (size_t j = 1; j < inst.size(); ++j) {
|
||||
ASSERT_EQ(inst[0].fvalue, inst[j].fvalue);
|
||||
page.SortRows();
|
||||
auto v = page.GetView();
|
||||
for (size_t i = 0; i < v.Size(); ++i) {
|
||||
auto column = v[i];
|
||||
for (size_t j = 1; j < column.size(); ++j) {
|
||||
ASSERT_GE(column[j].fvalue, column[j-1].fvalue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -142,7 +142,7 @@ TEST(EllpackPage, Copy) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::unique_ptr<DMatrix>
|
||||
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
BatchParam param{0, 256, kPageSize};
|
||||
BatchParam param{0, 256};
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||
|
||||
// Create an empty result page.
|
||||
@@ -188,7 +188,7 @@ TEST(EllpackPage, Compact) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::unique_ptr<DMatrix>
|
||||
dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
BatchParam param{0, 256, kPageSize};
|
||||
BatchParam param{0, 256};
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||
|
||||
// Create an empty result page.
|
||||
@@ -212,7 +212,7 @@ TEST(EllpackPage, Compact) {
|
||||
std::vector<bst_float> row_result(kCols);
|
||||
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
|
||||
auto impl = page.Impl();
|
||||
EXPECT_EQ(impl->base_rowid, current_row);
|
||||
ASSERT_EQ(impl->base_rowid, current_row);
|
||||
|
||||
for (size_t i = 0; i < impl->Size(); i++) {
|
||||
size_t compacted_row = row_indexes_h[current_row];
|
||||
|
||||
46
tests/cpp/data/test_file_iterator.cc
Normal file
46
tests/cpp/data/test_file_iterator.cc
Normal file
@@ -0,0 +1,46 @@
|
||||
/*!
|
||||
* Copyright 2021 XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <dmlc/filesystem.h>
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "../../../src/data/file_iterator.h"
|
||||
#include "../../../src/data/proxy_dmatrix.h"
|
||||
#include "../../../src/data/adapter.h"
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
// Checks that FileIterator reports 5 feature columns for generated libsvm
// data, both when the file uses 0-based and when it uses 1-based indices.
TEST(FileIterator, Basic) {
  // Visit every batch the iterator yields and remember the widest one.
  auto check_n_features = [](FileIterator *iter) {
    size_t max_columns = 0;
    iter->Reset();
    while (iter->Next()) {
      auto proxy = MakeProxy(iter->Proxy());
      auto csr = dmlc::get<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
      max_columns = std::max(max_columns, csr->NumColumns());
    }
    ASSERT_EQ(max_columns, 5);
  };

  dmlc::TemporaryDirectory tmpdir;

  // Write the test file, append the indexing-mode query to its path and run
  // the column check on a fresh iterator over it.
  auto run_case = [&](char const *file_name, bool zero_based, char const *query) {
    auto path = tmpdir.path + file_name;
    CreateBigTestData(path, 3 * 64, zero_based);
    path += query;
    FileIterator iter{path, 0, 1, "libsvm"};
    check_n_features(&iter);
  };

  run_case("/0-based.svm", true, "?indexing_mode=0");
  run_case("/1-based.svm", false, "?indexing_mode=1");
}
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
@@ -142,7 +142,7 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
|
||||
IterativeDeviceDMatrix m(
|
||||
&iter, iter.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(),
|
||||
0, 256);
|
||||
auto &ellpack = *m.GetBatches<EllpackPage>({0, 256, 0}).begin();
|
||||
auto &ellpack = *m.GetBatches<EllpackPage>({0, 256}).begin();
|
||||
auto impl = ellpack.Impl();
|
||||
common::CompressedIterator<uint32_t> iterator(
|
||||
impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
|
||||
|
||||
@@ -260,7 +260,7 @@ TEST(MetaInfo, HostExtend) {
|
||||
lhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size());
|
||||
rhs.SetInfo("group", groups.data(), xgboost::DataType::kUInt32, groups.size());
|
||||
|
||||
lhs.Extend(rhs, true);
|
||||
lhs.Extend(rhs, true, true);
|
||||
ASSERT_EQ(lhs.num_row_, kRows * 2);
|
||||
ASSERT_TRUE(lhs.labels_.HostCanRead());
|
||||
ASSERT_TRUE(rhs.labels_.HostCanRead());
|
||||
|
||||
@@ -141,7 +141,7 @@ TEST(MetaInfo, DeviceExtend) {
|
||||
lhs.num_row_ = kRows;
|
||||
rhs.num_row_ = kRows;
|
||||
|
||||
lhs.Extend(rhs, true);
|
||||
lhs.Extend(rhs, true, true);
|
||||
ASSERT_EQ(lhs.num_row_, kRows * 2);
|
||||
ASSERT_FALSE(lhs.labels_.HostCanRead());
|
||||
|
||||
|
||||
@@ -6,11 +6,100 @@
|
||||
#include <future>
|
||||
#include "../../../src/common/io.h"
|
||||
#include "../../../src/data/adapter.h"
|
||||
#include "../../../src/data/simple_dmatrix.h"
|
||||
#include "../../../src/data/sparse_page_dmatrix.h"
|
||||
#include "../../../src/data/file_iterator.h"
|
||||
#include "../helpers.h"
|
||||
|
||||
using namespace xgboost; // NOLINT
|
||||
|
||||
template <typename Page>
|
||||
void TestSparseDMatrixLoadFile() {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
auto opath = tmpdir.path + "/1-based.svm";
|
||||
CreateBigTestData(opath, 3 * 64, false);
|
||||
opath += "?indexing_mode=1";
|
||||
data::FileIterator iter{opath, 0, 1, "libsvm"};
|
||||
data::SparsePageDMatrix m{&iter,
|
||||
iter.Proxy(),
|
||||
data::fileiter::Reset,
|
||||
data::fileiter::Next,
|
||||
std::numeric_limits<float>::quiet_NaN(),
|
||||
1,
|
||||
"cache"};
|
||||
ASSERT_EQ(m.Info().num_col_, 5);
|
||||
ASSERT_EQ(m.Info().num_row_, 64);
|
||||
|
||||
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
|
||||
dmlc::Parser<uint32_t>::Create(opath.c_str(), 0, 1, "auto"));
|
||||
auto adapter = data::FileAdapter{parser.get()};
|
||||
|
||||
data::SimpleDMatrix simple{&adapter, std::numeric_limits<float>::quiet_NaN(),
|
||||
1};
|
||||
Page out;
|
||||
for (auto const& page : m.GetBatches<Page>()) {
|
||||
if (std::is_same<Page, SparsePage>::value) {
|
||||
out.Push(page);
|
||||
} else {
|
||||
out.PushCSC(page);
|
||||
}
|
||||
}
|
||||
ASSERT_EQ(m.Info().num_col_, simple.Info().num_col_);
|
||||
ASSERT_EQ(m.Info().num_row_, simple.Info().num_row_);
|
||||
|
||||
for (auto const& page : simple.GetBatches<Page>()) {
|
||||
ASSERT_EQ(page.offset.HostVector(), out.offset.HostVector());
|
||||
for (size_t i = 0; i < page.data.Size(); ++i) {
|
||||
ASSERT_EQ(page.data.HostVector()[i].fvalue, out.data.HostVector()[i].fvalue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run the file-loading comparison once per page layout: row-major pages,
// column pages, and sorted column pages.
TEST(SparsePageDMatrix, LoadFile) {
  TestSparseDMatrixLoadFile<SparsePage>();
  TestSparseDMatrixLoadFile<CSCPage>();
  TestSparseDMatrixLoadFile<SortedCSCPage>();
}
|
||||
|
||||
// allow caller to retain pages so they can process multiple pages at the same time.
|
||||
template <typename Page>
|
||||
void TestRetainPage() {
|
||||
auto m = CreateSparsePageDMatrix(10000);
|
||||
auto batches = m->GetBatches<Page>();
|
||||
auto begin = batches.begin();
|
||||
auto end = batches.end();
|
||||
|
||||
std::vector<Page> pages;
|
||||
std::vector<std::shared_ptr<Page const>> iterators;
|
||||
for (auto it = begin; it != end; ++it) {
|
||||
iterators.push_back(it.Page());
|
||||
pages.emplace_back(Page{});
|
||||
if (std::is_same<Page, SparsePage>::value) {
|
||||
pages.back().Push(*it);
|
||||
} else {
|
||||
pages.back().PushCSC(*it);
|
||||
}
|
||||
ASSERT_EQ(pages.back().Size(), (*it).Size());
|
||||
}
|
||||
ASSERT_GE(iterators.size(), 2);
|
||||
|
||||
for (size_t i = 0; i < iterators.size(); ++i) {
|
||||
ASSERT_EQ((*iterators[i]).Size(), pages.at(i).Size());
|
||||
ASSERT_EQ((*iterators[i]).data.HostVector(), pages.at(i).data.HostVector());
|
||||
}
|
||||
|
||||
// make sure it's const and the caller can not modify the content of page.
|
||||
for (auto& page : m->GetBatches<Page>()) {
|
||||
static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
|
||||
}
|
||||
}
|
||||
|
||||
// Run the page-retention check once per host page layout: row-major pages,
// column pages, and sorted column pages.
TEST(SparsePageDMatrix, RetainSparsePage) {
  TestRetainPage<SparsePage>();
  TestRetainPage<CSCPage>();
  TestRetainPage<SortedCSCPage>();
}
|
||||
|
||||
TEST(SparsePageDMatrix, MetaInfo) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
@@ -19,8 +108,6 @@ TEST(SparsePageDMatrix, MetaInfo) {
|
||||
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", false, false);
|
||||
std::cout << tmp_file << std::endl;
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
|
||||
|
||||
// Test the metadata that was parsed
|
||||
EXPECT_EQ(dmat->Info().num_row_, 8ul);
|
||||
@@ -32,10 +119,7 @@ TEST(SparsePageDMatrix, MetaInfo) {
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, RowAccess) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/big.libsvm";
|
||||
std::unique_ptr<xgboost::DMatrix> dmat =
|
||||
xgboost::CreateSparsePageDMatrix(24, 4, filename);
|
||||
std::unique_ptr<xgboost::DMatrix> dmat = xgboost::CreateSparsePageDMatrix(24);
|
||||
|
||||
// Test the data read into the first row
|
||||
auto &batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
|
||||
@@ -43,7 +127,7 @@ TEST(SparsePageDMatrix, RowAccess) {
|
||||
auto first_row = page[0];
|
||||
ASSERT_EQ(first_row.size(), 3ul);
|
||||
EXPECT_EQ(first_row[2].index, 2u);
|
||||
EXPECT_EQ(first_row[2].fvalue, 20);
|
||||
EXPECT_NEAR(first_row[2].fvalue, 0.986566, 1e-4);
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, ColAccess) {
|
||||
@@ -54,55 +138,46 @@ TEST(SparsePageDMatrix, ColAccess) {
|
||||
xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false);
|
||||
|
||||
// Loop over the batches and assert the data is as expected
|
||||
size_t iter = 0;
|
||||
for (auto const &col_batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
|
||||
auto col_page = col_batch.GetView();
|
||||
EXPECT_EQ(col_page.Size(), dmat->Info().num_col_);
|
||||
EXPECT_EQ(col_page[1][0].fvalue, 10.0f);
|
||||
EXPECT_EQ(col_page[1].size(), 1);
|
||||
ASSERT_EQ(col_page.Size(), dmat->Info().num_col_);
|
||||
if (iter == 1) {
|
||||
ASSERT_EQ(col_page[0][0].fvalue, 0.f);
|
||||
ASSERT_EQ(col_page[3][0].fvalue, 30.f);
|
||||
ASSERT_EQ(col_page[3][0].index, 1);
|
||||
ASSERT_EQ(col_page[3].size(), 1);
|
||||
} else {
|
||||
ASSERT_EQ(col_page[1][0].fvalue, 10.0f);
|
||||
ASSERT_EQ(col_page[1].size(), 1);
|
||||
}
|
||||
CHECK_LE(col_batch.base_rowid, dmat->Info().num_row_);
|
||||
++iter;
|
||||
}
|
||||
|
||||
// Loop over the batches and assert the data is as expected
|
||||
iter = 0;
|
||||
for (auto const &col_batch : dmat->GetBatches<xgboost::CSCPage>()) {
|
||||
auto col_page = col_batch.GetView();
|
||||
EXPECT_EQ(col_page.Size(), dmat->Info().num_col_);
|
||||
EXPECT_EQ(col_page[1][0].fvalue, 10.0f);
|
||||
EXPECT_EQ(col_page[1].size(), 1);
|
||||
if (iter == 0) {
|
||||
EXPECT_EQ(col_page[1][0].fvalue, 10.0f);
|
||||
EXPECT_EQ(col_page[1].size(), 1);
|
||||
} else {
|
||||
EXPECT_EQ(col_page[3][0].fvalue, 30.f);
|
||||
EXPECT_EQ(col_page[3].size(), 1);
|
||||
}
|
||||
iter++;
|
||||
}
|
||||
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.col.page"));
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.sorted.col.page"));
|
||||
|
||||
delete dmat;
|
||||
|
||||
EXPECT_FALSE(FileExists(tmp_file + ".cache"));
|
||||
EXPECT_FALSE(FileExists(tmp_file + ".cache.row.page"));
|
||||
EXPECT_FALSE(FileExists(tmp_file + ".cache.col.page"));
|
||||
EXPECT_FALSE(FileExists(tmp_file + ".cache.sorted.col.page"));
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, ExistingCacheFile) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/big.libsvm";
|
||||
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
|
||||
std::unique_ptr<xgboost::DMatrix> dmat =
|
||||
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
|
||||
EXPECT_ANY_THROW({
|
||||
std::unique_ptr<xgboost::DMatrix> dmat2 =
|
||||
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
|
||||
});
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, ThreadSafetyException) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/test";
|
||||
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
|
||||
size_t constexpr kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = 64 * kEntriesPerCol * 2;
|
||||
|
||||
std::unique_ptr<xgboost::DMatrix> dmat =
|
||||
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename);
|
||||
xgboost::CreateSparsePageDMatrix(kEntries);
|
||||
|
||||
int threads = 1000;
|
||||
|
||||
@@ -134,13 +209,10 @@ TEST(SparsePageDMatrix, ThreadSafetyException) {
|
||||
|
||||
// Multi-batches access
|
||||
TEST(SparsePageDMatrix, ColAccessBatches) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/big.libsvm";
|
||||
size_t constexpr kPageSize = 1024, kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
|
||||
// Create multiple sparse pages
|
||||
std::unique_ptr<xgboost::DMatrix> dmat{
|
||||
xgboost::CreateSparsePageDMatrix(kEntries, kPageSize, filename)};
|
||||
std::unique_ptr<xgboost::DMatrix> dmat{xgboost::CreateSparsePageDMatrix(kEntries)};
|
||||
auto n_threads = omp_get_max_threads();
|
||||
omp_set_num_threads(16);
|
||||
for (auto const &page : dmat->GetBatches<xgboost::CSCPage>()) {
|
||||
@@ -149,234 +221,37 @@ TEST(SparsePageDMatrix, ColAccessBatches) {
|
||||
omp_set_num_threads(n_threads);
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, Empty) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
std::vector<float> data{};
|
||||
std::vector<unsigned> feature_idx = {};
|
||||
std::vector<size_t> row_ptr = {};
|
||||
|
||||
{
|
||||
data::CSRAdapter csr_adapter(row_ptr.data(), feature_idx.data(),
|
||||
data.data(), 0, 0, 0);
|
||||
data::SparsePageDMatrix dmat(
|
||||
&csr_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
|
||||
EXPECT_EQ(dmat.Info().num_nonzero_, 0);
|
||||
EXPECT_EQ(dmat.Info().num_row_, 0);
|
||||
EXPECT_EQ(dmat.Info().num_col_, 0);
|
||||
for (auto &batch : dmat.GetBatches<SparsePage>()) {
|
||||
EXPECT_EQ(batch.Size(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
data::DenseAdapter dense_adapter(nullptr, 0, 0);
|
||||
data::SparsePageDMatrix dmat2(
|
||||
&dense_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
|
||||
EXPECT_EQ(dmat2.Info().num_nonzero_, 0);
|
||||
EXPECT_EQ(dmat2.Info().num_row_, 0);
|
||||
EXPECT_EQ(dmat2.Info().num_col_, 0);
|
||||
for (auto &batch : dmat2.GetBatches<SparsePage>()) {
|
||||
EXPECT_EQ(batch.Size(), 0);
|
||||
}
|
||||
}
|
||||
{
|
||||
data::CSCAdapter csc_adapter(nullptr, nullptr, nullptr, 0, 0);
|
||||
data::SparsePageDMatrix dmat3(
|
||||
&csc_adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
|
||||
EXPECT_EQ(dmat3.Info().num_nonzero_, 0);
|
||||
EXPECT_EQ(dmat3.Info().num_row_, 0);
|
||||
EXPECT_EQ(dmat3.Info().num_col_, 0);
|
||||
for (auto &batch : dmat3.GetBatches<SparsePage>()) {
|
||||
EXPECT_EQ(batch.Size(), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, MissingData) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
std::vector<float> data{0.0, std::nanf(""), 1.0};
|
||||
std::vector<unsigned> feature_idx = {0, 1, 0};
|
||||
std::vector<size_t> row_ptr = {0, 2, 3};
|
||||
|
||||
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
|
||||
3, 2);
|
||||
data::SparsePageDMatrix dmat(
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
|
||||
EXPECT_EQ(dmat.Info().num_nonzero_, 2);
|
||||
|
||||
const std::string tmp_file2 = tempdir.path + "/simple2.libsvm";
|
||||
data::SparsePageDMatrix dmat2(&adapter, 1.0, 1, tmp_file2);
|
||||
EXPECT_EQ(dmat2.Info().num_nonzero_, 1);
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, EmptyRow) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
std::vector<float> data{0.0, 1.0};
|
||||
std::vector<unsigned> feature_idx = {0, 1};
|
||||
std::vector<size_t> row_ptr = {0, 2, 2};
|
||||
|
||||
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
|
||||
2, 2);
|
||||
data::SparsePageDMatrix dmat(
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
|
||||
EXPECT_EQ(dmat.Info().num_nonzero_, 2);
|
||||
EXPECT_EQ(dmat.Info().num_row_, 2);
|
||||
EXPECT_EQ(dmat.Info().num_col_, 2);
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, FromDense) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
int m = 3;
|
||||
int n = 2;
|
||||
std::vector<float> data = {1, 2, 3, 4, 5, 6};
|
||||
data::DenseAdapter adapter(data.data(), m, n);
|
||||
data::SparsePageDMatrix dmat(
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), 1, tmp_file);
|
||||
EXPECT_EQ(dmat.Info().num_col_, 2);
|
||||
EXPECT_EQ(dmat.Info().num_row_, 3);
|
||||
EXPECT_EQ(dmat.Info().num_nonzero_, 6);
|
||||
|
||||
for (auto &batch : dmat.GetBatches<SparsePage>()) {
|
||||
auto page = batch.GetView();
|
||||
for (auto i = 0ull; i < batch.Size(); i++) {
|
||||
auto inst = page[i];
|
||||
for (auto j = 0ull; j < inst.size(); j++) {
|
||||
EXPECT_EQ(inst[j].fvalue, data[i * n + j]);
|
||||
EXPECT_EQ(inst[j].index, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, FromCSC) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
std::vector<float> data = {1, 3, 2, 4, 5};
|
||||
std::vector<unsigned> row_idx = {0, 1, 0, 1, 2};
|
||||
std::vector<size_t> col_ptr = {0, 2, 5};
|
||||
data::CSCAdapter adapter(col_ptr.data(), row_idx.data(), data.data(), 2, 3);
|
||||
data::SparsePageDMatrix dmat(
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file);
|
||||
EXPECT_EQ(dmat.Info().num_col_, 2);
|
||||
EXPECT_EQ(dmat.Info().num_row_, 3);
|
||||
EXPECT_EQ(dmat.Info().num_nonzero_, 5);
|
||||
|
||||
auto &batch = *dmat.GetBatches<SparsePage>().begin();
|
||||
auto page = batch.GetView();
|
||||
auto inst = page[0];
|
||||
EXPECT_EQ(inst[0].fvalue, 1);
|
||||
EXPECT_EQ(inst[0].index, 0);
|
||||
EXPECT_EQ(inst[1].fvalue, 2);
|
||||
EXPECT_EQ(inst[1].index, 1);
|
||||
|
||||
inst = page[1];
|
||||
EXPECT_EQ(inst[0].fvalue, 3);
|
||||
EXPECT_EQ(inst[0].index, 0);
|
||||
EXPECT_EQ(inst[1].fvalue, 4);
|
||||
EXPECT_EQ(inst[1].index, 1);
|
||||
|
||||
inst = page[2];
|
||||
EXPECT_EQ(inst[0].fvalue, 5);
|
||||
EXPECT_EQ(inst[0].index, 1);
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, FromFile) {
|
||||
std::string filename = "test.libsvm";
|
||||
CreateBigTestData(filename, 20);
|
||||
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
|
||||
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
|
||||
data::FileAdapter adapter(parser.get());
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
|
||||
data::SparsePageDMatrix dmat(
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 1);
|
||||
ASSERT_EQ(dmat.Info().num_col_, 5);
|
||||
|
||||
for (auto &batch : dmat.GetBatches<SparsePage>()) {
|
||||
std::vector<bst_row_t> expected_offset(batch.Size() + 1);
|
||||
auto page = batch.GetView();
|
||||
int n = -3;
|
||||
std::generate(expected_offset.begin(), expected_offset.end(),
|
||||
[&n] { return n += 3; });
|
||||
EXPECT_EQ(batch.offset.HostVector(), expected_offset);
|
||||
|
||||
if (batch.base_rowid % 2 == 0) {
|
||||
EXPECT_EQ(page[0][0].index, 0);
|
||||
EXPECT_EQ(page[0][1].index, 1);
|
||||
EXPECT_EQ(page[0][2].index, 2);
|
||||
} else {
|
||||
EXPECT_EQ(page[0][0].index, 0);
|
||||
EXPECT_EQ(page[0][1].index, 3);
|
||||
EXPECT_EQ(page[0][2].index, 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, Large) {
|
||||
std::string filename = "test.libsvm";
|
||||
CreateBigTestData(filename, 1 << 16);
|
||||
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
|
||||
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
|
||||
data::FileAdapter adapter(parser.get());
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
|
||||
std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix(
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 16)};
|
||||
std::unique_ptr<DMatrix> simple{DMatrix::Load(filename, true, true)};
|
||||
|
||||
std::vector<float> sparse_data;
|
||||
std::vector<size_t> sparse_rptr;
|
||||
std::vector<bst_feature_t> sparse_cids;
|
||||
DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids);
|
||||
|
||||
std::vector<float> simple_data;
|
||||
std::vector<size_t> simple_rptr;
|
||||
std::vector<bst_feature_t> simple_cids;
|
||||
DMatrixToCSR(simple.get(), &simple_data, &simple_rptr, &simple_cids);
|
||||
|
||||
ASSERT_EQ(sparse_rptr.size(), sparse->Info().num_row_ + 1);
|
||||
ASSERT_EQ(sparse_rptr.size(), simple->Info().num_row_ + 1);
|
||||
|
||||
ASSERT_EQ(sparse_data.size(), simple_data.size());
|
||||
ASSERT_EQ(sparse_data, simple_data);
|
||||
ASSERT_EQ(sparse_rptr.size(), simple_rptr.size());
|
||||
ASSERT_EQ(sparse_rptr, simple_rptr);
|
||||
ASSERT_EQ(sparse_cids, simple_cids);
|
||||
}
|
||||
|
||||
auto TestSparsePageDMatrixDeterminism(int32_t threads, std::string const& filename) {
|
||||
auto TestSparsePageDMatrixDeterminism(int32_t threads) {
|
||||
omp_set_num_threads(threads);
|
||||
std::vector<float> sparse_data;
|
||||
std::vector<size_t> sparse_rptr;
|
||||
std::vector<bst_feature_t> sparse_cids;
|
||||
|
||||
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
|
||||
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
|
||||
data::FileAdapter adapter(parser.get());
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix(
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 1 << 8)};
|
||||
std::string filename = tempdir.path + "/simple.libsvm";
|
||||
CreateBigTestData(filename, 1 << 16);
|
||||
|
||||
data::FileIterator iter(filename, 0, 1, "auto");
|
||||
std::unique_ptr<DMatrix> sparse{new data::SparsePageDMatrix{
|
||||
&iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
|
||||
std::numeric_limits<float>::quiet_NaN(), 1, filename}};
|
||||
|
||||
DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids);
|
||||
|
||||
std::string cache_name = tmp_file + ".row.page";
|
||||
auto cache_name =
|
||||
data::MakeId(filename,
|
||||
dynamic_cast<data::SparsePageDMatrix *>(sparse.get())) +
|
||||
".row.page";
|
||||
std::string cache = common::LoadSequentialFile(cache_name);
|
||||
return cache;
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, Determinism) {
|
||||
std::string filename = "test.libsvm";
|
||||
CreateBigTestData(filename, 1 << 16);
|
||||
#if defined(_MSC_VER)
|
||||
return;
|
||||
#endif // defined(_MSC_VER)
|
||||
std::vector<std::string> caches;
|
||||
for (size_t i = 1; i < 18; i += 2) {
|
||||
caches.emplace_back(TestSparsePageDMatrixDeterminism(i, filename));
|
||||
caches.emplace_back(TestSparsePageDMatrixDeterminism(i));
|
||||
}
|
||||
|
||||
for (size_t i = 1; i < caches.size(); ++i) {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "../helpers.h"
|
||||
#include "../../../src/common/compressed_iterator.h"
|
||||
#include "../../../src/data/ellpack_page.cuh"
|
||||
#include "../../../src/data/sparse_page_dmatrix.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@@ -14,13 +15,22 @@ TEST(SparsePageDMatrix, EllpackPage) {
|
||||
DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false);
|
||||
|
||||
// Loop over the batches and assert the data is as expected
|
||||
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256, 64})) {
|
||||
EXPECT_EQ(batch.Size(), dmat->Info().num_row_);
|
||||
size_t n = 0;
|
||||
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
|
||||
n += batch.Size();
|
||||
}
|
||||
EXPECT_EQ(n, dmat->Info().num_row_);
|
||||
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.ellpack.page"));
|
||||
auto path =
|
||||
data::MakeId(tmp_file + ".cache",
|
||||
dynamic_cast<data::SparsePageDMatrix *>(dmat)) +
|
||||
".row.page";
|
||||
EXPECT_TRUE(FileExists(path));
|
||||
path =
|
||||
data::MakeId(tmp_file + ".cache",
|
||||
dynamic_cast<data::SparsePageDMatrix *>(dmat)) +
|
||||
".ellpack.page";
|
||||
EXPECT_TRUE(FileExists(path));
|
||||
|
||||
delete dmat;
|
||||
}
|
||||
@@ -30,12 +40,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
|
||||
std::string filename = tmpdir.path + "/big.libsvm";
|
||||
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
|
||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename);
|
||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, filename);
|
||||
|
||||
// Loop over the batches and count the records
|
||||
int64_t batch_count = 0;
|
||||
int64_t row_count = 0;
|
||||
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256, 7UL})) {
|
||||
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
|
||||
EXPECT_LT(batch.Size(), dmat->Info().num_row_);
|
||||
batch_count++;
|
||||
row_count += batch.Size();
|
||||
@@ -43,7 +53,36 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
|
||||
EXPECT_GE(batch_count, 2);
|
||||
EXPECT_EQ(row_count, dmat->Info().num_row_);
|
||||
|
||||
EXPECT_TRUE(FileExists(filename + ".cache.ellpack.page"));
|
||||
auto path =
|
||||
data::MakeId(filename,
|
||||
dynamic_cast<data::SparsePageDMatrix *>(dmat.get())) +
|
||||
".ellpack.page";
|
||||
}
|
||||
|
||||
// Ellpack counterpart of the page-retention test: keep the shared pointer of
// every batch together with a copy of its gidx buffer, then check that the
// retained pages still hold the same buffers after iteration has finished.
TEST(SparsePageDMatrix, RetainEllpackPage) {
  auto m = CreateSparsePageDMatrix(10000);
  auto batches = m->GetBatches<EllpackPage>({0, 32});
  auto begin = batches.begin();
  auto end = batches.end();

  std::vector<HostDeviceVector<common::CompressedByteT>> buffer_copies;
  std::vector<std::shared_ptr<EllpackPage const>> retained;
  for (auto it = begin; it != end; ++it) {
    retained.push_back(it.Page());
    buffer_copies.emplace_back(HostDeviceVector<common::CompressedByteT>{});
    auto& dst = buffer_copies.back();
    dst.Resize((*it).Impl()->gidx_buffer.Size());
    dst.Copy((*it).Impl()->gidx_buffer);
  }
  // The test is only meaningful when there is more than one page.
  ASSERT_GE(retained.size(), 2);

  for (size_t k = 0; k < retained.size(); ++k) {
    ASSERT_EQ(retained[k]->Impl()->gidx_buffer.HostVector(), buffer_copies.at(k).HostVector());
  }

  // Batches must be handed out as const so that the caller cannot modify the
  // content of a page.
  for (auto& page : m->GetBatches<EllpackPage>({0, 32})) {
    using Referred = std::remove_reference_t<decltype(page)>;
    static_assert(std::is_const<Referred>::value, "");
  }
}
|
||||
|
||||
TEST(SparsePageDMatrix, EllpackPageContent) {
|
||||
@@ -59,7 +98,7 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
|
||||
std::unique_ptr<DMatrix>
|
||||
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
|
||||
BatchParam param{0, 2, 0};
|
||||
BatchParam param{0, 2};
|
||||
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||
EXPECT_EQ(impl->base_rowid, 0);
|
||||
EXPECT_EQ(impl->n_rows, kRows);
|
||||
@@ -67,7 +106,17 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
|
||||
EXPECT_EQ(impl->row_stride, 2);
|
||||
EXPECT_EQ(impl->Cuts().TotalBins(), 4);
|
||||
|
||||
auto impl_ext = (*dmat_ext->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||
std::unique_ptr<EllpackPageImpl> impl_ext;
|
||||
size_t offset = 0;
|
||||
for (auto& batch : dmat_ext->GetBatches<EllpackPage>(param)) {
|
||||
if (!impl_ext) {
|
||||
impl_ext.reset(new EllpackPageImpl(
|
||||
batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(),
|
||||
batch.Impl()->is_dense, batch.Impl()->row_stride, kRows));
|
||||
}
|
||||
auto n_elems = impl_ext->Copy(0, batch.Impl(), offset);
|
||||
offset += n_elems;
|
||||
}
|
||||
EXPECT_EQ(impl_ext->base_rowid, 0);
|
||||
EXPECT_EQ(impl_ext->n_rows, kRows);
|
||||
EXPECT_FALSE(impl_ext->is_dense);
|
||||
@@ -109,7 +158,7 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
|
||||
std::unique_ptr<DMatrix>
|
||||
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
|
||||
BatchParam param{0, kMaxBins, kPageSize};
|
||||
BatchParam param{0, kMaxBins};
|
||||
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||
EXPECT_EQ(impl->base_rowid, 0);
|
||||
EXPECT_EQ(impl->n_rows, kRows);
|
||||
@@ -150,7 +199,7 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
|
||||
std::unique_ptr<DMatrix>
|
||||
dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
|
||||
|
||||
BatchParam param{0, kMaxBins, kPageSize};
|
||||
BatchParam param{0, kMaxBins};
|
||||
|
||||
size_t current_row = 0;
|
||||
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
|
||||
|
||||
@@ -155,7 +155,8 @@ TEST(GBTree, ChoosePredictor) {
|
||||
ASSERT_TRUE(data.HostCanWrite());
|
||||
|
||||
// pull data into device.
|
||||
data = HostDeviceVector<Entry>(data.HostVector(), 0);
|
||||
data.HostVector();
|
||||
data.SetDevice(0);
|
||||
data.DeviceSpan();
|
||||
ASSERT_FALSE(data.HostCanWrite());
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#include "xgboost/c_api.h"
|
||||
#include "../../src/data/adapter.h"
|
||||
#include "../../src/data/simple_dmatrix.h"
|
||||
#include "../../src/data/sparse_page_dmatrix.h"
|
||||
#include "../../src/gbm/gbtree_model.h"
|
||||
#include "xgboost/predictor.h"
|
||||
|
||||
@@ -45,12 +46,25 @@ void CreateSimpleTestData(const std::string& filename) {
|
||||
CreateBigTestData(filename, 6);
|
||||
}
|
||||
|
||||
void CreateBigTestData(const std::string& filename, size_t n_entries) {
|
||||
void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based) {
|
||||
std::ofstream fo(filename.c_str());
|
||||
const size_t entries_per_row = 3;
|
||||
std::string odd_row;
|
||||
if (zero_based) {
|
||||
odd_row = " 0:0 3:30 4:40\n";
|
||||
} else {
|
||||
odd_row = " 1:0 4:30 5:40\n";
|
||||
}
|
||||
std::string even_row;
|
||||
if (zero_based) {
|
||||
even_row = " 0:0 1:10 2:20\n";
|
||||
} else {
|
||||
even_row = " 1:0 2:10 3:20\n";
|
||||
}
|
||||
|
||||
size_t n_rows = (n_entries + entries_per_row - 1) / entries_per_row;
|
||||
for (size_t i = 0; i < n_rows; ++i) {
|
||||
const char* row = i % 2 == 0 ? " 0:0 1:10 2:20\n" : " 0:0 3:30 4:40\n";
|
||||
auto row = i % 2 == 0 ? even_row : odd_row;
|
||||
fo << i << row;
|
||||
}
|
||||
}
|
||||
@@ -348,13 +362,20 @@ GetDMatrixFromData(const std::vector<float> &x, int num_rows, int num_columns){
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), 1));
|
||||
}
|
||||
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
|
||||
size_t n_entries, size_t page_size, std::string tmp_file) {
|
||||
// Create sufficiently large data to make two row pages
|
||||
CreateBigTestData(tmp_file, n_entries);
|
||||
std::unique_ptr<DMatrix> dmat { DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)};
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries,
|
||||
std::string prefix) {
|
||||
size_t n_columns = 3;
|
||||
size_t n_rows = n_entries / n_columns;
|
||||
ArrayIterForTest iter(0, n_rows, n_columns, 2);
|
||||
|
||||
std::unique_ptr<DMatrix> dmat{DMatrix::Create(
|
||||
static_cast<DataIterHandle>(&iter), iter.Proxy(), Reset, Next,
|
||||
std::numeric_limits<float>::quiet_NaN(), 1, prefix)};
|
||||
auto row_page_path =
|
||||
data::MakeId(prefix,
|
||||
dynamic_cast<data::SparsePageDMatrix *>(dmat.get())) +
|
||||
".row.page";
|
||||
EXPECT_TRUE(FileExists(row_page_path)) << row_page_path;
|
||||
|
||||
// Loop over the batches and count the records
|
||||
int64_t batch_count = 0;
|
||||
@@ -368,7 +389,6 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
|
||||
return dmat;
|
||||
}
|
||||
|
||||
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
|
||||
size_t n_rows, size_t n_cols, size_t page_size, bool deterministic,
|
||||
const dmlc::TemporaryDirectory& tempdir) {
|
||||
@@ -432,7 +452,7 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
|
||||
uri += "#" + tmp_file + ".cache";
|
||||
}
|
||||
std::unique_ptr<DMatrix> dmat(
|
||||
DMatrix::Load(uri, true, false, "auto", page_size));
|
||||
DMatrix::Load(uri, true, false, "auto"));
|
||||
return dmat;
|
||||
}
|
||||
|
||||
@@ -481,6 +501,28 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(
|
||||
return gbm;
|
||||
}
|
||||
|
||||
ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols,
|
||||
size_t batches) : rows_{rows}, cols_{cols}, n_batches_{batches} {
|
||||
XGProxyDMatrixCreate(&proxy_);
|
||||
rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity});
|
||||
std::tie(batches_, interface_) =
|
||||
rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
|
||||
}
|
||||
|
||||
ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); }
|
||||
|
||||
int ArrayIterForTest::Next() {
|
||||
if (iter_ == n_batches_) {
|
||||
return 0;
|
||||
}
|
||||
XGProxyDMatrixSetDataDense(proxy_, batches_[iter_].c_str());
|
||||
iter_++;
|
||||
return 1;
|
||||
}
|
||||
|
||||
size_t constexpr ArrayIterForTest::kRows;
|
||||
size_t constexpr ArrayIterForTest::kCols;
|
||||
|
||||
void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
|
||||
std::vector<size_t> *p_row_ptr,
|
||||
std::vector<bst_feature_t> *p_cids) {
|
||||
|
||||
@@ -8,16 +8,16 @@ namespace xgboost {
|
||||
|
||||
CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows,
|
||||
size_t cols, size_t batches)
|
||||
: rows_{rows}, cols_{cols}, n_batches_{batches} {
|
||||
XGProxyDMatrixCreate(&proxy_);
|
||||
rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity});
|
||||
: ArrayIterForTest{sparsity, rows, cols, batches} {
|
||||
rng_->Device(0);
|
||||
std::tie(batches_, interface_) =
|
||||
rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
|
||||
this->Reset();
|
||||
}
|
||||
|
||||
CudaArrayIterForTest::~CudaArrayIterForTest() { XGDMatrixFree(proxy_); }
|
||||
size_t constexpr CudaArrayIterForTest::kRows;
|
||||
size_t constexpr CudaArrayIterForTest::kCols;
|
||||
size_t constexpr CudaArrayIterForTest::kBatches;
|
||||
|
||||
int CudaArrayIterForTest::Next() {
|
||||
if (iter_ == n_batches_) {
|
||||
@@ -28,8 +28,6 @@ int CudaArrayIterForTest::Next() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
size_t constexpr CudaArrayIterForTest::kRows;
|
||||
size_t constexpr CudaArrayIterForTest::kCols;
|
||||
|
||||
std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix(bool with_label,
|
||||
bool float_label,
|
||||
|
||||
@@ -55,7 +55,9 @@ int64_t GetFileSize(const std::string& filename);
|
||||
|
||||
void CreateSimpleTestData(const std::string& filename);
|
||||
|
||||
void CreateBigTestData(const std::string& filename, size_t n_entries);
|
||||
// Create a libsvm format file with 3 entries per-row. `zero_based` specifies whether it's
|
||||
// 0-based indexing.
|
||||
void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based = true);
|
||||
|
||||
void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
|
||||
std::vector<xgboost::bst_float> preds,
|
||||
@@ -300,8 +302,7 @@ GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
|
||||
std::shared_ptr<DMatrix> GetDMatrixFromData(const std::vector<float> &x,
|
||||
int num_rows, int num_columns);
|
||||
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
|
||||
size_t n_entries, size_t page_size, std::string tmp_file);
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, std::string prefix = "cache");
|
||||
|
||||
/**
|
||||
* \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
|
||||
@@ -356,7 +357,8 @@ inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_row
|
||||
|
||||
typedef void *DMatrixHandle; // NOLINT(*);
|
||||
|
||||
class CudaArrayIterForTest {
|
||||
class ArrayIterForTest {
|
||||
protected:
|
||||
HostDeviceVector<float> data_;
|
||||
size_t iter_ {0};
|
||||
DMatrixHandle proxy_;
|
||||
@@ -373,20 +375,32 @@ class CudaArrayIterForTest {
|
||||
size_t static constexpr kBatches { 100 };
|
||||
size_t static constexpr kCols { 13 };
|
||||
|
||||
explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
|
||||
size_t cols = kCols, size_t batches = kBatches);
|
||||
~CudaArrayIterForTest();
|
||||
|
||||
std::string AsArray() const {
|
||||
return interface_;
|
||||
}
|
||||
|
||||
int Next();
|
||||
void Reset() {
|
||||
virtual int Next();
|
||||
virtual void Reset() {
|
||||
iter_ = 0;
|
||||
}
|
||||
size_t Iter() const { return iter_; }
|
||||
auto Proxy() -> decltype(proxy_) { return proxy_; }
|
||||
|
||||
explicit ArrayIterForTest(float sparsity, size_t rows = kRows,
|
||||
size_t cols = kCols, size_t batches = kBatches);
|
||||
virtual ~ArrayIterForTest();
|
||||
};
|
||||
|
||||
class CudaArrayIterForTest : public ArrayIterForTest {
|
||||
public:
|
||||
size_t static constexpr kRows{1000};
|
||||
size_t static constexpr kBatches{100};
|
||||
size_t static constexpr kCols{13};
|
||||
|
||||
explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
|
||||
size_t cols = kCols, size_t batches = kBatches);
|
||||
int Next() override;
|
||||
~CudaArrayIterForTest() override = default;
|
||||
};
|
||||
|
||||
void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
|
||||
@@ -396,11 +410,11 @@ void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
|
||||
typedef void *DataIterHandle; // NOLINT(*)
|
||||
|
||||
inline void Reset(DataIterHandle self) {
|
||||
static_cast<CudaArrayIterForTest*>(self)->Reset();
|
||||
static_cast<ArrayIterForTest*>(self)->Reset();
|
||||
}
|
||||
|
||||
inline int Next(DataIterHandle self) {
|
||||
return static_cast<CudaArrayIterForTest*>(self)->Next();
|
||||
return static_cast<ArrayIterForTest*>(self)->Next();
|
||||
}
|
||||
|
||||
class RMMAllocator;
|
||||
|
||||
@@ -92,13 +92,10 @@ TEST(CpuPredictor, IterationRange) {
|
||||
}
|
||||
|
||||
TEST(CpuPredictor, ExternalMemory) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/big.libsvm";
|
||||
|
||||
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
|
||||
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
|
||||
|
||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries, kPageSize, filename);
|
||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
|
||||
auto lparam = CreateEmptyGenericParam(GPUIDX);
|
||||
|
||||
std::unique_ptr<Predictor> cpu_predictor =
|
||||
|
||||
@@ -102,13 +102,10 @@ TEST(GPUPredictor, ExternalMemoryTest) {
|
||||
|
||||
gbm::GBTreeModel model = CreateTestModel(&param, n_classes);
|
||||
std::vector<std::unique_ptr<DMatrix>> dmats;
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string file0 = tmpdir.path + "/big_0.libsvm";
|
||||
std::string file1 = tmpdir.path + "/big_1.libsvm";
|
||||
std::string file2 = tmpdir.path + "/big_2.libsvm";
|
||||
dmats.push_back(CreateSparsePageDMatrix(400, 64UL, file0));
|
||||
dmats.push_back(CreateSparsePageDMatrix(800, 128UL, file1));
|
||||
dmats.push_back(CreateSparsePageDMatrix(8000, 1024UL, file2));
|
||||
|
||||
dmats.push_back(CreateSparsePageDMatrix(400));
|
||||
dmats.push_back(CreateSparsePageDMatrix(800));
|
||||
dmats.push_back(CreateSparsePageDMatrix(8000));
|
||||
|
||||
for (const auto& dmat: dmats) {
|
||||
dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5);
|
||||
|
||||
@@ -98,8 +98,7 @@ TEST(Learner, SLOW_CheckMultiBatch) { // NOLINT
|
||||
const std::string tmp_file = tempdir.path + "/big.libsvm";
|
||||
CreateBigTestData(tmp_file, 50000);
|
||||
std::shared_ptr<DMatrix> dmat(xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", 100));
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false, "auto"));
|
||||
EXPECT_FALSE(dmat->SingleColBlock());
|
||||
size_t num_row = dmat->Info().num_row_;
|
||||
std::vector<bst_float> labels(num_row);
|
||||
|
||||
@@ -27,7 +27,7 @@ void VerifySampling(size_t page_size,
|
||||
}
|
||||
gpair.SetDevice(0);
|
||||
|
||||
BatchParam param{0, 256, page_size};
|
||||
BatchParam param{0, 256};
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||
if (page_size != 0) {
|
||||
EXPECT_NE(page->n_rows, kRows);
|
||||
@@ -82,7 +82,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
|
||||
auto gpair = GenerateRandomGradients(kRows);
|
||||
gpair.SetDevice(0);
|
||||
|
||||
BatchParam param{0, 256, kPageSize};
|
||||
BatchParam param{0, 256};
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||
EXPECT_NE(page->n_rows, kRows);
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
|
||||
|
||||
float sparsity = is_dense ? 0.0f : 0.5f;
|
||||
auto matrix = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix();
|
||||
BatchParam batch_param{0, static_cast<int32_t>(kBins), 0};
|
||||
BatchParam batch_param{0, static_cast<int32_t>(kBins)};
|
||||
|
||||
for (auto const& batch : matrix->GetBatches<EllpackPage>(batch_param)) {
|
||||
auto* page = batch.Impl();
|
||||
@@ -116,7 +116,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
|
||||
auto x = GenerateRandomCategoricalSingleColumn(kRows, num_categories);
|
||||
auto cat_m = GetDMatrixFromData(x, kRows, 1);
|
||||
cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
|
||||
BatchParam batch_param{0, static_cast<int32_t>(kBins), 0};
|
||||
BatchParam batch_param{0, static_cast<int32_t>(kBins)};
|
||||
tree::RowPartitioner row_partitioner(0, kRows);
|
||||
auto ridx = row_partitioner.GetRows(0);
|
||||
dh::device_vector<GradientPairPrecise> cat_hist(num_categories);
|
||||
|
||||
@@ -152,7 +152,6 @@ TEST(GpuHist, ApplySplit) {
|
||||
BatchParam bparam;
|
||||
bparam.gpu_id = 0;
|
||||
bparam.max_bin = 3;
|
||||
bparam.gpu_page_size = 0;
|
||||
|
||||
for (auto& ellpack : m->GetBatches<EllpackPage>(bparam)){
|
||||
auto impl = ellpack.Impl();
|
||||
@@ -291,9 +290,13 @@ void TestHistogramIndexImpl() {
|
||||
// Extract the device maker from the histogram makers and from that its compressed
|
||||
// histogram index
|
||||
const auto &maker = hist_maker.maker;
|
||||
auto grad = GenerateRandomGradients(kNRows);
|
||||
grad.SetDevice(0);
|
||||
maker->Reset(&grad, hist_maker_dmat.get(), kNCols);
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());
|
||||
|
||||
const auto &maker_ext = hist_maker_ext.maker;
|
||||
maker_ext->Reset(&grad, hist_maker_ext_dmat.get(), kNCols);
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector());
|
||||
|
||||
ASSERT_EQ(maker->page->Cuts().TotalBins(), maker_ext->page->Cuts().TotalBins());
|
||||
@@ -365,7 +368,7 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
|
||||
// Loop over the batches and count the records
|
||||
int64_t batch_count = 0;
|
||||
int64_t row_count = 0;
|
||||
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, max_bin, gpu_page_size})) {
|
||||
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, max_bin})) {
|
||||
EXPECT_LT(batch.Size(), dmat->Info().num_row_);
|
||||
batch_count++;
|
||||
row_count += batch.Size();
|
||||
@@ -386,7 +389,6 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
|
||||
|
||||
tree::GPUHistMakerSpecialised<GradientPairPrecise> hist_maker;
|
||||
GenericParameter generic_param(CreateEmptyGenericParam(0));
|
||||
generic_param.gpu_page_size = gpu_page_size;
|
||||
hist_maker.Configure(args, &generic_param);
|
||||
|
||||
hist_maker.Update(gpair, dmat, {tree});
|
||||
|
||||
Reference in New Issue
Block a user