External data adapters (#5044)

* Use external data adapters as a lightweight intermediate layer between external data and DMatrix
This commit is contained in:
Rory Mitchell
2019-12-04 10:56:17 +13:00
committed by GitHub
parent f2277e7106
commit e3c34c79be
15 changed files with 1058 additions and 593 deletions

View File

@@ -4,6 +4,9 @@
#include "../../../src/data/simple_dmatrix.h"
#include "../helpers.h"
#include "../../../src/data/adapter.h"
using namespace xgboost; // NOLINT
TEST(SimpleDMatrix, MetaInfo) {
dmlc::TemporaryDirectory tempdir;
@@ -63,3 +66,63 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
EXPECT_EQ(num_col_batch, 1) << "Expected number of batches to be 1";
delete dmat;
}
// Builds a SimpleDMatrix from a CSR, a dense and a CSC adapter, each created
// with all-zero size arguments, and checks that the resulting matrix reports
// zero non-zeros, zero rows, zero columns and only batches of Size() == 0.
TEST(SimpleDMatrix, Empty) {
  // Value handed to SimpleDMatrix as the "missing" marker for every case.
  const float kMissing = std::numeric_limits<float>::quiet_NaN();

  // Shared verification, run once per adapter flavour.
  auto check_is_empty = [](data::SimpleDMatrix &dmat) {
    CHECK_EQ(dmat.Info().num_nonzero_, 0);
    CHECK_EQ(dmat.Info().num_row_, 0);
    CHECK_EQ(dmat.Info().num_col_, 0);
    for (auto &batch : dmat.GetBatches<SparsePage>()) {
      CHECK_EQ(batch.Size(), 0);
    }
  };

  // CSR adapter over empty vectors.
  std::vector<float> values{};
  std::vector<unsigned> column_ids = {};
  std::vector<size_t> offsets = {};
  data::CSRAdapter csr_adapter(offsets.data(), column_ids.data(),
                               values.data(), 0, 0, 0);
  data::SimpleDMatrix dmat(&csr_adapter, kMissing, 1);
  check_is_empty(dmat);

  // Dense adapter over a null buffer.
  data::DenseAdapter dense_adapter(nullptr, 0, 0, 0);
  dmat = data::SimpleDMatrix(&dense_adapter, kMissing, 1);
  check_is_empty(dmat);

  // CSC adapter over null buffers.
  data::CSCAdapter csc_adapter(nullptr, nullptr, nullptr, 0, 0);
  dmat = data::SimpleDMatrix(&csc_adapter, kMissing, 1);
  check_is_empty(dmat);
}
// Checks how the "missing" argument of SimpleDMatrix affects num_nonzero_
// for a small CSR input holding the values 0.0, NaN and 1.0.
TEST(SimpleDMatrix, MissingData) {
  // CSR buffers: offsets {0, 2, 3} over three stored values.
  std::vector<size_t> row_ptr = {0, 2, 3};
  std::vector<unsigned> feature_idx = {0, 1, 0};
  std::vector<float> values{0.0, std::nanf(""), 1.0};

  // NOTE(review): the trailing 2, 3, 2 are presumably row count, element
  // count and feature count -- confirm against CSRAdapter's constructor.
  data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), values.data(),
                           2, 3, 2);

  // With NaN as the missing marker, two entries are expected to be counted.
  const float kNaN = std::numeric_limits<float>::quiet_NaN();
  data::SimpleDMatrix dmat(&adapter, kNaN, 1);
  CHECK_EQ(dmat.Info().num_nonzero_, 2);

  // With 1.0 as the missing marker, only one entry is expected to be counted
  // (presumably the NaN entry is still excluded).
  dmat = data::SimpleDMatrix(&adapter, 1.0, 1);
  CHECK_EQ(dmat.Info().num_nonzero_, 1);
}
// Checks the dimensions reported for a CSR input whose last two row offsets
// are equal (offsets {0, 2, 2}), i.e. the second offset range is empty.
TEST(SimpleDMatrix, EmptyRow) {
  std::vector<size_t> row_ptr = {0, 2, 2};
  std::vector<unsigned> feature_idx = {0, 1};
  std::vector<float> values{0.0, 1.0};

  // NOTE(review): the trailing 2, 2, 2 are presumably row count, element
  // count and feature count -- confirm against CSRAdapter's constructor.
  data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), values.data(),
                           2, 2, 2);

  const float kNaN = std::numeric_limits<float>::quiet_NaN();
  data::SimpleDMatrix dmat(&adapter, kNaN, 1);

  // Two stored entries, two rows and two columns are expected.
  CHECK_EQ(dmat.Info().num_nonzero_, 2);
  CHECK_EQ(dmat.Info().num_row_, 2);
  CHECK_EQ(dmat.Info().num_col_, 2);
}