Use adapters for SparsePageDMatrix (#5092)

This commit is contained in:
Rory Mitchell
2019-12-11 15:59:23 +13:00
committed by GitHub
parent e089e16e3d
commit c7cc657a4d
12 changed files with 437 additions and 253 deletions

View File

@@ -1,8 +1,6 @@
// Copyright (c) 2019 by Contributors
#include <gtest/gtest.h>
#include <xgboost/c_api.h>
#include <xgboost/data.h>
#include <xgboost/version_config.h>
#include "../../../src/data/adapter.h"
#include "../../../src/data/simple_dmatrix.h"
#include "../../../src/common/timer.h"
@@ -29,71 +27,6 @@ TEST(c_api, CSRAdapter) {
EXPECT_EQ(line2 .GetElement(0).value, 5);
EXPECT_EQ(line2 .GetElement(0).row_idx, 2);
EXPECT_EQ(line2 .GetElement(0).column_idx, 1);
data::SimpleDMatrix dmat(&adapter, std::nan(""), -1);
EXPECT_EQ(dmat.Info().num_col_, 2);
EXPECT_EQ(dmat.Info().num_row_, 3);
EXPECT_EQ(dmat.Info().num_nonzero_, 5);
for (auto &batch : dmat.GetBatches<SparsePage>()) {
for (auto i = 0ull; i < batch.Size(); i++) {
auto inst = batch[i];
for(auto j = 0ull; j < inst.size(); j++)
{
EXPECT_EQ(inst[j].fvalue, data[row_ptr[i] + j]);
EXPECT_EQ(inst[j].index, feature_idx[row_ptr[i] + j]);
}
}
}
}
// Feeds a 3x2 row-major dense buffer through DenseAdapter and verifies the
// shape and the stored entries of the resulting SimpleDMatrix.
TEST(c_api, DenseAdapter) {
  const int num_rows = 3;
  const int num_cols = 2;
  std::vector<float> values = {1, 2, 3, 4, 5, 6};
  data::DenseAdapter adapter(values.data(), num_rows, num_rows * num_cols,
                             num_cols);
  const float missing = std::numeric_limits<float>::quiet_NaN();
  data::SimpleDMatrix dmat(&adapter, missing, -1);

  const auto &info = dmat.Info();
  EXPECT_EQ(info.num_col_, 2);
  EXPECT_EQ(info.num_row_, 3);
  EXPECT_EQ(info.num_nonzero_, 6);

  for (auto &page : dmat.GetBatches<SparsePage>()) {
    for (auto row = 0ull; row < page.Size(); ++row) {
      auto entries = page[row];
      for (auto col = 0ull; col < entries.size(); ++col) {
        // Entry `col` of row `row` lives at row * num_cols + col in the input.
        EXPECT_EQ(entries[col].fvalue, values[row * num_cols + col]);
        EXPECT_EQ(entries[col].index, col);
      }
    }
  }
}
// Builds a SimpleDMatrix from a CSC description of a 3-row, 2-column matrix
// and checks the entries row by row:
//   row 0 -> (col 0: 1), (col 1: 2)
//   row 1 -> (col 0: 3), (col 1: 4)
//   row 2 -> (col 1: 5)
TEST(c_api, CSCAdapter) {
  std::vector<float> values = {1, 3, 2, 4, 5};
  std::vector<unsigned> row_indices = {0, 1, 0, 1, 2};
  std::vector<size_t> col_offsets = {0, 2, 5};
  data::CSCAdapter adapter(col_offsets.data(), row_indices.data(),
                           values.data(), 2, 3);
  const float missing = std::numeric_limits<float>::quiet_NaN();
  data::SimpleDMatrix dmat(&adapter, missing, -1);

  const auto &info = dmat.Info();
  EXPECT_EQ(info.num_col_, 2);
  EXPECT_EQ(info.num_row_, 3);
  EXPECT_EQ(info.num_nonzero_, 5);

  // Only the first batch is inspected.
  auto &page = *dmat.GetBatches<SparsePage>().begin();

  auto first_row = page[0];
  EXPECT_EQ(first_row[0].fvalue, 1);
  EXPECT_EQ(first_row[0].index, 0);
  EXPECT_EQ(first_row[1].fvalue, 2);
  EXPECT_EQ(first_row[1].index, 1);

  auto second_row = page[1];
  EXPECT_EQ(second_row[0].fvalue, 3);
  EXPECT_EQ(second_row[0].index, 0);
  EXPECT_EQ(second_row[1].fvalue, 4);
  EXPECT_EQ(second_row[1].index, 1);

  auto third_row = page[2];
  EXPECT_EQ(third_row[0].fvalue, 5);
  EXPECT_EQ(third_row[0].index, 1);
}
TEST(c_api, CSCAdapterColsMoreThanRows) {
@@ -128,10 +61,3 @@ TEST(c_api, CSCAdapterColsMoreThanRows) {
EXPECT_EQ(inst[3].fvalue, 8);
EXPECT_EQ(inst[3].index, 3);
}
// Smoke test: a FileAdapter can be constructed on top of a dmlc parser that
// reads a freshly generated libsvm file. No assertions are made.
TEST(c_api, FileAdapter) {
  const std::string libsvm_path = "test.libsvm";
  CreateBigTestData(libsvm_path, 10);

  using ParserT = dmlc::Parser<uint32_t>;
  std::unique_ptr<ParserT> parser(
      ParserT::Create(libsvm_path.c_str(), 0, 1, "auto"));
  data::FileAdapter adapter(parser.get());
}

View File

@@ -1,10 +1,10 @@
// Copyright by Contributors
#include <xgboost/data.h>
#include <dmlc/filesystem.h>
#include <xgboost/data.h>
#include "../../../src/data/simple_dmatrix.h"
#include "../helpers.h"
#include "../../../src/data/adapter.h"
#include "../helpers.h"
using namespace xgboost; // NOLINT
@@ -12,7 +12,7 @@ TEST(SimpleDMatrix, MetaInfo) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
CreateSimpleTestData(tmp_file);
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, true, false);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, true, false);
// Test the metadata that was parsed
EXPECT_EQ(dmat->Info().num_row_, 2);
@@ -27,7 +27,7 @@ TEST(SimpleDMatrix, RowAccess) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
CreateSimpleTestData(tmp_file);
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, false, false);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, false, false);
// Loop over the batches and count the records
int64_t row_count = 0;
@@ -49,7 +49,7 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
CreateSimpleTestData(tmp_file);
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, true, false);
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, true, false);
// Sorted column access
EXPECT_EQ(dmat->GetColDensity(0), 1);
@@ -72,7 +72,8 @@ TEST(SimpleDMatrix, Empty) {
std::vector<unsigned> feature_idx = {};
std::vector<size_t> row_ptr = {};
data::CSRAdapter csr_adapter(row_ptr.data(), feature_idx.data(), data.data(), 0, 0, 0);
data::CSRAdapter csr_adapter(row_ptr.data(), feature_idx.data(), data.data(),
0, 0, 0);
data::SimpleDMatrix dmat(&csr_adapter,
std::numeric_limits<float>::quiet_NaN(), 1);
CHECK_EQ(dmat.Info().num_nonzero_, 0);
@@ -108,8 +109,10 @@ TEST(SimpleDMatrix, MissingData) {
std::vector<unsigned> feature_idx = {0, 1, 0};
std::vector<size_t> row_ptr = {0, 2, 3};
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2, 3, 2);
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), 1);
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
3, 2);
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
1);
CHECK_EQ(dmat.Info().num_nonzero_, 2);
dmat = data::SimpleDMatrix(&adapter, 1.0, 1);
CHECK_EQ(dmat.Info().num_nonzero_, 1);
@@ -120,34 +123,86 @@ TEST(SimpleDMatrix, EmptyRow) {
std::vector<unsigned> feature_idx = {0, 1};
std::vector<size_t> row_ptr = {0, 2, 2};
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2, 2, 2);
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), 1);
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2,
2, 2);
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
1);
CHECK_EQ(dmat.Info().num_nonzero_, 2);
CHECK_EQ(dmat.Info().num_row_, 2);
CHECK_EQ(dmat.Info().num_col_, 2);
}
// Constructs a SimpleDMatrix from a 3x2 row-major dense buffer and verifies
// the reported dimensions as well as every stored (value, column) pair.
TEST(SimpleDMatrix, FromDense) {
  const int rows = 3;
  const int cols = 2;
  std::vector<float> dense = {1, 2, 3, 4, 5, 6};
  data::DenseAdapter adapter(dense.data(), rows, rows * cols, cols);
  const float nan_missing = std::numeric_limits<float>::quiet_NaN();
  data::SimpleDMatrix dmat(&adapter, nan_missing, -1);

  const auto &meta = dmat.Info();
  EXPECT_EQ(meta.num_col_, 2);
  EXPECT_EQ(meta.num_row_, 3);
  EXPECT_EQ(meta.num_nonzero_, 6);

  for (auto &page : dmat.GetBatches<SparsePage>()) {
    for (auto r = 0ull; r < page.Size(); ++r) {
      auto row = page[r];
      for (auto c = 0ull; c < row.size(); ++c) {
        // The dense buffer is indexed as r * cols + c.
        EXPECT_EQ(row[c].fvalue, dense[r * cols + c]);
        EXPECT_EQ(row[c].index, c);
      }
    }
  }
}
// Constructs a SimpleDMatrix from CSC input (2 columns, 3 rows, 5 entries)
// and checks the first batch row by row:
//   row 0 -> (col 0: 1), (col 1: 2)
//   row 1 -> (col 0: 3), (col 1: 4)
//   row 2 -> (col 1: 5)
TEST(SimpleDMatrix, FromCSC) {
  std::vector<float> values = {1, 3, 2, 4, 5};
  std::vector<unsigned> row_indices = {0, 1, 0, 1, 2};
  std::vector<size_t> col_offsets = {0, 2, 5};
  data::CSCAdapter adapter(col_offsets.data(), row_indices.data(),
                           values.data(), 2, 3);
  const float nan_missing = std::numeric_limits<float>::quiet_NaN();
  data::SimpleDMatrix dmat(&adapter, nan_missing, -1);

  const auto &meta = dmat.Info();
  EXPECT_EQ(meta.num_col_, 2);
  EXPECT_EQ(meta.num_row_, 3);
  EXPECT_EQ(meta.num_nonzero_, 5);

  auto &page = *dmat.GetBatches<SparsePage>().begin();

  auto row0 = page[0];
  EXPECT_EQ(row0[0].fvalue, 1);
  EXPECT_EQ(row0[0].index, 0);
  EXPECT_EQ(row0[1].fvalue, 2);
  EXPECT_EQ(row0[1].index, 1);

  auto row1 = page[1];
  EXPECT_EQ(row1[0].fvalue, 3);
  EXPECT_EQ(row1[0].index, 0);
  EXPECT_EQ(row1[1].fvalue, 4);
  EXPECT_EQ(row1[1].index, 1);

  auto row2 = page[2];
  EXPECT_EQ(row2[0].fvalue, 5);
  EXPECT_EQ(row2[0].index, 1);
}
TEST(SimpleDMatrix, FromFile) {
std::string filename = "test.libsvm";
CreateBigTestData(filename, 3 * 5);
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
data::FileAdapter adapter(parser.get());
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
1);
1);
for (auto &batch : dmat.GetBatches<SparsePage>()) {
EXPECT_EQ(batch.Size(), 5);
EXPECT_EQ(batch.offset.HostVector(),
std::vector<bst_row_t>({0, 3, 6, 9, 12, 15}));
std::vector<bst_row_t>({0, 3, 6, 9, 12, 15}));
EXPECT_EQ(batch.base_rowid, 0);
for (auto i = 0ull; i < batch.Size(); i++) {
if (i%2== 0) {
if (i % 2 == 0) {
EXPECT_EQ(batch[i][0].index, 0);
EXPECT_EQ(batch[i][1].index, 1);
EXPECT_EQ(batch[i][2].index, 2);
}
else {
} else {
EXPECT_EQ(batch[i][0].index, 0);
EXPECT_EQ(batch[i][1].index, 3);
EXPECT_EQ(batch[i][2].index, 4);

View File

@@ -1,12 +1,12 @@
// Copyright by Contributors
#include <dmlc/filesystem.h>
#include <xgboost/data.h>
#include <dmlc/filesystem.h>
#include <cinttypes>
#include "../../../src/data/sparse_page_dmatrix.h"
#include "../../../src/data/adapter.h"
#include "../helpers.h"
#include <gtest/gtest.h>
using namespace xgboost; // NOLINT
TEST(SparsePageDMatrix, MetaInfo) {
dmlc::TemporaryDirectory tempdir;
@@ -87,3 +87,158 @@ TEST(SparsePageDMatrix, ColAccessBatches) {
}
omp_set_num_threads(n_threads);
}
// Constructs SparsePageDMatrix objects from empty CSR, dense and CSC adapters
// and checks that each reports zero rows, columns and non-zeros and yields
// only empty batches. All three matrices are given the same path argument.
TEST(SparsePageDMatrix, Empty) {
  dmlc::TemporaryDirectory tempdir;
  const std::string tmp_path = tempdir.path + "/simple.libsvm";
  const float nan_missing = std::numeric_limits<float>::quiet_NaN();

  // Empty CSR input.
  std::vector<float> values{};
  std::vector<unsigned> feature_indices = {};
  std::vector<size_t> row_offsets = {};
  data::CSRAdapter csr_adapter(row_offsets.data(), feature_indices.data(),
                               values.data(), 0, 0, 0);
  data::SparsePageDMatrix from_csr(&csr_adapter, nan_missing, 1, tmp_path);
  EXPECT_EQ(from_csr.Info().num_nonzero_, 0);
  EXPECT_EQ(from_csr.Info().num_row_, 0);
  EXPECT_EQ(from_csr.Info().num_col_, 0);
  for (auto &page : from_csr.GetBatches<SparsePage>()) {
    EXPECT_EQ(page.Size(), 0);
  }

  // Empty dense input.
  data::DenseAdapter dense_adapter(nullptr, 0, 0, 0);
  data::SparsePageDMatrix from_dense(&dense_adapter, nan_missing, 1, tmp_path);
  EXPECT_EQ(from_dense.Info().num_nonzero_, 0);
  EXPECT_EQ(from_dense.Info().num_row_, 0);
  EXPECT_EQ(from_dense.Info().num_col_, 0);
  for (auto &page : from_dense.GetBatches<SparsePage>()) {
    EXPECT_EQ(page.Size(), 0);
  }

  // Empty CSC input.
  data::CSCAdapter csc_adapter(nullptr, nullptr, nullptr, 0, 0);
  data::SparsePageDMatrix from_csc(&csc_adapter, nan_missing, 1, tmp_path);
  EXPECT_EQ(from_csc.Info().num_nonzero_, 0);
  EXPECT_EQ(from_csc.Info().num_row_, 0);
  EXPECT_EQ(from_csc.Info().num_col_, 0);
  for (auto &page : from_csc.GetBatches<SparsePage>()) {
    EXPECT_EQ(page.Size(), 0);
  }
}
// Checks how the `missing` argument affects the non-zero count for the CSR
// values {0.0, NaN, 1.0}: two entries are expected when NaN is the missing
// value, and one entry when 1.0 is the missing value.
TEST(SparsePageDMatrix, MissingData) {
  dmlc::TemporaryDirectory tempdir;
  const std::string first_path = tempdir.path + "/simple.libsvm";
  std::vector<float> values{0.0, std::nanf(""), 1.0};
  std::vector<unsigned> feature_indices = {0, 1, 0};
  std::vector<size_t> row_offsets = {0, 2, 3};
  data::CSRAdapter adapter(row_offsets.data(), feature_indices.data(),
                           values.data(), 2, 3, 2);

  const float nan_missing = std::numeric_limits<float>::quiet_NaN();
  data::SparsePageDMatrix nan_dmat(&adapter, nan_missing, 1, first_path);
  EXPECT_EQ(nan_dmat.Info().num_nonzero_, 2);

  // The second matrix gets its own path inside the same temporary directory.
  const std::string second_path = tempdir.path + "/simple2.libsvm";
  data::SparsePageDMatrix one_dmat(&adapter, 1.0, 1, second_path);
  EXPECT_EQ(one_dmat.Info().num_nonzero_, 1);
}
// CSR input whose second row has no entries (row offsets {0, 2, 2}): the
// matrix is still expected to report 2 rows, 2 columns and 2 non-zeros.
TEST(SparsePageDMatrix, EmptyRow) {
  dmlc::TemporaryDirectory tempdir;
  const std::string tmp_path = tempdir.path + "/simple.libsvm";
  std::vector<float> values{0.0, 1.0};
  std::vector<unsigned> feature_indices = {0, 1};
  std::vector<size_t> row_offsets = {0, 2, 2};
  data::CSRAdapter adapter(row_offsets.data(), feature_indices.data(),
                           values.data(), 2, 2, 2);
  const float nan_missing = std::numeric_limits<float>::quiet_NaN();
  data::SparsePageDMatrix dmat(&adapter, nan_missing, 1, tmp_path);

  const auto &meta = dmat.Info();
  EXPECT_EQ(meta.num_nonzero_, 2);
  EXPECT_EQ(meta.num_row_, 2);
  EXPECT_EQ(meta.num_col_, 2);
}
// Constructs a SparsePageDMatrix from a 3x2 row-major dense buffer and
// verifies the reported dimensions and every stored (value, column) pair.
TEST(SparsePageDMatrix, FromDense) {
  dmlc::TemporaryDirectory tempdir;
  const std::string tmp_path = tempdir.path + "/simple.libsvm";
  const int rows = 3;
  const int cols = 2;
  std::vector<float> dense = {1, 2, 3, 4, 5, 6};
  data::DenseAdapter adapter(dense.data(), rows, rows * cols, cols);
  const float nan_missing = std::numeric_limits<float>::quiet_NaN();
  data::SparsePageDMatrix dmat(&adapter, nan_missing, 1, tmp_path);

  const auto &meta = dmat.Info();
  EXPECT_EQ(meta.num_col_, 2);
  EXPECT_EQ(meta.num_row_, 3);
  EXPECT_EQ(meta.num_nonzero_, 6);

  for (auto &page : dmat.GetBatches<SparsePage>()) {
    for (auto r = 0ull; r < page.Size(); ++r) {
      auto row = page[r];
      for (auto c = 0ull; c < row.size(); ++c) {
        // The dense buffer is indexed as r * cols + c.
        EXPECT_EQ(row[c].fvalue, dense[r * cols + c]);
        EXPECT_EQ(row[c].index, c);
      }
    }
  }
}
// Constructs a SparsePageDMatrix from CSC input (2 columns, 3 rows, 5
// entries) and checks the first batch row by row:
//   row 0 -> (col 0: 1), (col 1: 2)
//   row 1 -> (col 0: 3), (col 1: 4)
//   row 2 -> (col 1: 5)
TEST(SparsePageDMatrix, FromCSC) {
  dmlc::TemporaryDirectory tempdir;
  const std::string tmp_path = tempdir.path + "/simple.libsvm";
  std::vector<float> values = {1, 3, 2, 4, 5};
  std::vector<unsigned> row_indices = {0, 1, 0, 1, 2};
  std::vector<size_t> col_offsets = {0, 2, 5};
  data::CSCAdapter adapter(col_offsets.data(), row_indices.data(),
                           values.data(), 2, 3);
  const float nan_missing = std::numeric_limits<float>::quiet_NaN();
  data::SparsePageDMatrix dmat(&adapter, nan_missing, -1, tmp_path);

  const auto &meta = dmat.Info();
  EXPECT_EQ(meta.num_col_, 2);
  EXPECT_EQ(meta.num_row_, 3);
  EXPECT_EQ(meta.num_nonzero_, 5);

  auto &page = *dmat.GetBatches<SparsePage>().begin();

  auto row0 = page[0];
  EXPECT_EQ(row0[0].fvalue, 1);
  EXPECT_EQ(row0[0].index, 0);
  EXPECT_EQ(row0[1].fvalue, 2);
  EXPECT_EQ(row0[1].index, 1);

  auto row1 = page[1];
  EXPECT_EQ(row1[0].fvalue, 3);
  EXPECT_EQ(row1[0].index, 0);
  EXPECT_EQ(row1[1].fvalue, 4);
  EXPECT_EQ(row1[1].index, 1);

  auto row2 = page[2];
  EXPECT_EQ(row2[0].fvalue, 5);
  EXPECT_EQ(row2[0].index, 1);
}
// Loads a generated libsvm file through FileAdapter into a SparsePageDMatrix
// and checks, for every batch, the row offsets and the feature indices of the
// batch's first row.
TEST(SparsePageDMatrix, FromFile) {
  // Declared first so that it is destroyed last: the parser and the matrix,
  // which both work on paths inside this directory, go away before it does.
  dmlc::TemporaryDirectory tempdir;

  // Generate the input inside the temporary directory instead of the working
  // directory, so the test leaves no file behind and does not share the name
  // "test.libsvm" with other tests that create a file of that name.
  const std::string filename = tempdir.path + "/test.libsvm";
  CreateBigTestData(filename, 20);
  std::unique_ptr<dmlc::Parser<uint32_t>> parser(
      dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
  data::FileAdapter adapter(parser.get());

  const std::string tmp_file = tempdir.path + "/simple.libsvm";
  // NOTE(review): the trailing `1` is presumably the page size, which would
  // force many small batches; confirm against the constructor.
  data::SparsePageDMatrix dmat(
      &adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 1);

  for (auto &batch : dmat.GetBatches<SparsePage>()) {
    // Every row is expected to hold three entries: offsets are 0, 3, 6, ...
    std::vector<bst_row_t> expected_offset(batch.Size() + 1);
    for (size_t i = 0; i < expected_offset.size(); ++i) {
      expected_offset[i] = 3 * i;
    }
    EXPECT_EQ(batch.offset.HostVector(), expected_offset);

    // The expected feature indices alternate with the parity of the batch's
    // base row id.
    if (batch.base_rowid % 2 == 0) {
      EXPECT_EQ(batch[0][0].index, 0);
      EXPECT_EQ(batch[0][1].index, 1);
      EXPECT_EQ(batch[0][2].index, 2);
    } else {
      EXPECT_EQ(batch[0][0].index, 0);
      EXPECT_EQ(batch[0][1].index, 3);
      EXPECT_EQ(batch[0][2].index, 4);
    }
  }
}