External data adapters (#5044)
* Use external data adapters as a lightweight intermediate layer between external data and DMatrix
This commit is contained in:
54
tests/cpp/common/test_group_data.cc
Normal file
54
tests/cpp/common/test_group_data.cc
Normal file
@@ -0,0 +1,54 @@
|
||||
/*!
|
||||
* Copyright 2019 by Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h>
|
||||
#include "../../../src/common/group_data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
TEST(group_data, ParallelGroupBuilder) {
|
||||
std::vector<size_t> offsets;
|
||||
std::vector<Entry> data;
|
||||
ParallelGroupBuilder<Entry, size_t> builder(&offsets, &data);
|
||||
builder.InitBudget(0, 1);
|
||||
// Add two rows with two elements each
|
||||
builder.AddBudget(0, 0, 2);
|
||||
builder.AddBudget(1, 0, 2);
|
||||
|
||||
builder.InitStorage();
|
||||
builder.Push(0, Entry(0, 0), 0);
|
||||
builder.Push(0, Entry(1, 1), 0);
|
||||
builder.Push(1, Entry(0, 2), 0);
|
||||
builder.Push(1, Entry(1, 3), 0);
|
||||
|
||||
std::vector<Entry> expected_data{
|
||||
Entry(0, 0),
|
||||
Entry(1, 1),
|
||||
Entry(0, 2),
|
||||
Entry(1, 3),
|
||||
};
|
||||
std::vector<size_t> expected_offsets{0, 2, 4};
|
||||
|
||||
EXPECT_EQ(data, expected_data);
|
||||
EXPECT_EQ(offsets, expected_offsets);
|
||||
|
||||
// Create new builder, add one more row given already populated offsets/data
|
||||
ParallelGroupBuilder<Entry, size_t> builder2(&offsets, &data);
|
||||
builder2.InitBudget(0, 1);
|
||||
builder2.AddBudget(2, 0, 2);
|
||||
builder2.InitStorage();
|
||||
builder2.Push(2, Entry(0, 4), 0);
|
||||
builder2.Push(2, Entry(1, 5), 0);
|
||||
|
||||
expected_data.emplace_back(Entry(0, 4));
|
||||
expected_data.emplace_back(Entry(1, 5));
|
||||
expected_offsets.emplace_back(6);
|
||||
|
||||
EXPECT_EQ(data, expected_data);
|
||||
EXPECT_EQ(offsets, expected_offsets);
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
104
tests/cpp/data/test_adapter.cc
Normal file
104
tests/cpp/data/test_adapter.cc
Normal file
@@ -0,0 +1,104 @@
|
||||
// Copyright (c) 2019 by Contributors
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/c_api.h>
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/version_config.h>
|
||||
#include "../../../src/data/adapter.h"
|
||||
#include "../../../src/data/simple_dmatrix.h"
|
||||
#include "../../../src/common/timer.h"
|
||||
#include "../helpers.h"
|
||||
using namespace xgboost; // NOLINT
|
||||
/*!
 * \brief Reads a 3-row CSR matrix through CSRAdapter line by line, then builds
 *        a SimpleDMatrix from the same adapter and checks that every stored
 *        entry matches the source arrays.
 */
TEST(c_api, CSRAdapter) {
  int n = 2;
  std::vector<float> data = {1, 2, 3, 4, 5};
  std::vector<unsigned> feature_idx = {0, 1, 0, 1, 1};
  std::vector<size_t> row_ptr = {0, 2, 4, 5};
  data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(),
                           row_ptr.size() - 1, data.size(), n);

  // Inspect the adapter's batch directly.
  adapter.Next();
  auto &adapter_batch = adapter.Value();
  auto line0 = adapter_batch.GetLine(0);
  EXPECT_EQ(line0.GetElement(0).value, 1);
  EXPECT_EQ(line0.GetElement(1).value, 2);

  auto line1 = adapter_batch.GetLine(1);
  EXPECT_EQ(line1.GetElement(0).value, 3);
  EXPECT_EQ(line1.GetElement(1).value, 4);

  auto line2 = adapter_batch.GetLine(2);
  EXPECT_EQ(line2.GetElement(0).value, 5);
  EXPECT_EQ(line2.GetElement(0).row_idx, 2);
  EXPECT_EQ(line2.GetElement(0).column_idx, 1);

  // The missing value is the second argument and the thread count the third.
  // Passing NaN in the integer slot would be an undefined float-to-int
  // conversion and would silently make -1 the missing value.
  data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), 1);
  EXPECT_EQ(dmat.Info().num_col_, 2);
  EXPECT_EQ(dmat.Info().num_row_, 3);
  EXPECT_EQ(dmat.Info().num_nonzero_, 5);

  // Every stored entry must match the CSR arrays at the same position.
  for (auto &page : dmat.GetBatches<SparsePage>()) {
    for (size_t i = 0; i < page.Size(); i++) {
      auto inst = page[i];
      for (size_t j = 0; j < inst.size(); j++) {
        EXPECT_EQ(inst[j].fvalue, data[row_ptr[i] + j]);
        EXPECT_EQ(inst[j].index, feature_idx[row_ptr[i] + j]);
      }
    }
  }
}
|
||||
/*!
 * \brief Builds a SimpleDMatrix from a dense row-major 3x2 buffer through
 *        DenseAdapter and checks the shape and every stored entry.
 */
TEST(c_api, DenseAdapter) {
  int m = 3;
  int n = 2;
  std::vector<float> data = {1, 2, 3, 4, 5, 6};
  data::DenseAdapter adapter(data.data(), m, m * n, n);

  // The missing value is the second argument and the thread count the third.
  // Passing NaN in the integer slot would be an undefined float-to-int
  // conversion and would silently make -1 the missing value.
  data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), 1);
  EXPECT_EQ(dmat.Info().num_col_, 2);
  EXPECT_EQ(dmat.Info().num_row_, 3);
  EXPECT_EQ(dmat.Info().num_nonzero_, 6);

  // Entry j of row i must equal the source element at i * n + j.
  for (auto &page : dmat.GetBatches<SparsePage>()) {
    for (size_t i = 0; i < page.Size(); i++) {
      auto inst = page[i];
      for (size_t j = 0; j < inst.size(); j++) {
        EXPECT_EQ(inst[j].fvalue, data[i * n + j]);
        EXPECT_EQ(inst[j].index, j);
      }
    }
  }
}
|
||||
|
||||
/*!
 * \brief Builds a SimpleDMatrix from column-compressed input through
 *        CSCAdapter and checks that the rows come out in row-major order.
 *
 * Column 0 holds values {1, 3} at rows {0, 1}; column 1 holds values
 * {2, 4, 5} at rows {0, 1, 2}.
 */
TEST(c_api, CSCAdapter) {
  std::vector<float> data = {1, 3, 2, 4, 5};
  std::vector<unsigned> row_idx = {0, 1, 0, 1, 2};
  std::vector<size_t> col_ptr = {0, 2, 5};
  // NOTE(review): the trailing 2 and 3 look like column and row counts;
  // confirm against the CSCAdapter constructor.
  data::CSCAdapter adapter(col_ptr.data(), row_idx.data(), data.data(), 2, 3);

  // The missing value is the second argument and the thread count the third.
  // Passing NaN in the integer slot would be an undefined float-to-int
  // conversion and would silently make -1 the missing value.
  data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), 1);
  EXPECT_EQ(dmat.Info().num_col_, 2);
  EXPECT_EQ(dmat.Info().num_row_, 3);
  EXPECT_EQ(dmat.Info().num_nonzero_, 5);

  auto &page = *dmat.GetBatches<SparsePage>().begin();

  // Row 0: both features present.
  auto inst = page[0];
  EXPECT_EQ(inst[0].fvalue, 1);
  EXPECT_EQ(inst[0].index, 0);
  EXPECT_EQ(inst[1].fvalue, 2);
  EXPECT_EQ(inst[1].index, 1);

  // Row 1: both features present.
  inst = page[1];
  EXPECT_EQ(inst[0].fvalue, 3);
  EXPECT_EQ(inst[0].index, 0);
  EXPECT_EQ(inst[1].fvalue, 4);
  EXPECT_EQ(inst[1].index, 1);

  // Row 2: only feature 1 present.
  inst = page[2];
  EXPECT_EQ(inst[0].fvalue, 5);
  EXPECT_EQ(inst[0].index, 1);
}
|
||||
|
||||
/*!
 * \brief Smoke test: a FileAdapter can be constructed on top of a dmlc parser
 *        opened on a freshly generated libsvm file.
 */
TEST(c_api, FileAdapter) {
  // NOTE(review): the generated file is written to the working directory and
  // is not removed by this test.
  std::string libsvm_path = "test.libsvm";
  CreateBigTestData(libsvm_path, 10);

  std::unique_ptr<dmlc::Parser<uint32_t>> libsvm_parser(
      dmlc::Parser<uint32_t>::Create(libsvm_path.c_str(), 0, 1, "auto"));
  data::FileAdapter adapter(libsvm_parser.get());
}
|
||||
@@ -101,7 +101,6 @@ TEST(DMatrix, Uri) {
|
||||
std::string path = tmpdir.path + "/small.csv";
|
||||
|
||||
std::ofstream fout(path);
|
||||
ASSERT_TRUE(fout);
|
||||
size_t i = 0;
|
||||
for (size_t r = 0; r < kRows; ++r) {
|
||||
for (size_t c = 0; c < kCols; ++c) {
|
||||
|
||||
@@ -4,6 +4,9 @@
|
||||
#include "../../../src/data/simple_dmatrix.h"
|
||||
|
||||
#include "../helpers.h"
|
||||
#include "../../../src/data/adapter.h"
|
||||
|
||||
using namespace xgboost; // NOLINT
|
||||
|
||||
TEST(SimpleDMatrix, MetaInfo) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
@@ -63,3 +66,63 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
|
||||
EXPECT_EQ(num_col_batch, 1) << "Expected number of batches to be 1";
|
||||
delete dmat;
|
||||
}
|
||||
|
||||
/*!
 * \brief A SimpleDMatrix built from an empty CSR, dense or CSC adapter must
 *        report zero rows, columns and non-zeros, and only empty batches.
 */
TEST(SimpleDMatrix, Empty) {
  // Shared checks applied to each freshly built (or reassigned) matrix.
  auto check_is_empty = [](data::SimpleDMatrix &matrix) {
    CHECK_EQ(matrix.Info().num_nonzero_, 0);
    CHECK_EQ(matrix.Info().num_row_, 0);
    CHECK_EQ(matrix.Info().num_col_, 0);
    for (auto &page : matrix.GetBatches<SparsePage>()) {
      CHECK_EQ(page.Size(), 0);
    }
  };

  // CSR input backed by empty vectors.
  std::vector<float> values{};
  std::vector<unsigned> columns = {};
  std::vector<size_t> offsets = {};
  data::CSRAdapter csr_adapter(offsets.data(), columns.data(), values.data(),
                               0, 0, 0);
  data::SimpleDMatrix dmat(&csr_adapter,
                           std::numeric_limits<float>::quiet_NaN(), 1);
  check_is_empty(dmat);

  // Dense input with a null buffer.
  data::DenseAdapter dense_adapter(nullptr, 0, 0, 0);
  dmat = data::SimpleDMatrix(&dense_adapter,
                             std::numeric_limits<float>::quiet_NaN(), 1);
  check_is_empty(dmat);

  // CSC input with null buffers.
  data::CSCAdapter csc_adapter(nullptr, nullptr, nullptr, 0, 0);
  dmat = data::SimpleDMatrix(&csc_adapter,
                             std::numeric_limits<float>::quiet_NaN(), 1);
  check_is_empty(dmat);
}
|
||||
|
||||
/*!
 * \brief Entries equal to the configured missing value are dropped when a
 *        SimpleDMatrix is built from a CSR adapter.
 */
TEST(SimpleDMatrix, MissingData) {
  std::vector<float> values{0.0, std::nanf(""), 1.0};
  std::vector<unsigned> columns = {0, 1, 0};
  std::vector<size_t> offsets = {0, 2, 3};
  data::CSRAdapter adapter(offsets.data(), columns.data(), values.data(),
                           2, 3, 2);

  // With NaN as the missing value, two of the three entries are expected.
  data::SimpleDMatrix dmat(&adapter,
                           std::numeric_limits<float>::quiet_NaN(), 1);
  CHECK_EQ(dmat.Info().num_nonzero_, 2);

  // Rebuilding from the same adapter with 1.0 as the missing value is
  // expected to leave a single entry.
  dmat = data::SimpleDMatrix(&adapter, 1.0, 1);
  CHECK_EQ(dmat.Info().num_nonzero_, 1);
}
|
||||
|
||||
/*!
 * \brief A CSR input whose second row has no entries must still yield a
 *        matrix with two rows, two columns and two stored values.
 */
TEST(SimpleDMatrix, EmptyRow) {
  std::vector<float> values{0.0, 1.0};
  std::vector<unsigned> columns = {0, 1};
  // Offsets {0, 2, 2}: row 0 owns both entries, row 1 owns none.
  std::vector<size_t> offsets = {0, 2, 2};
  data::CSRAdapter adapter(offsets.data(), columns.data(), values.data(),
                           2, 2, 2);

  data::SimpleDMatrix dmat(&adapter,
                           std::numeric_limits<float>::quiet_NaN(), 1);
  const auto &info = dmat.Info();
  CHECK_EQ(info.num_nonzero_, 2);
  CHECK_EQ(info.num_row_, 2);
  CHECK_EQ(info.num_col_, 2);
}
|
||||
|
||||
Reference in New Issue
Block a user