Add number of columns to native data iterator. (#5202)
* Change native data iter into an adapter.
This commit is contained in:
@@ -1,18 +1,24 @@
|
||||
// Copyright (c) 2019 by Contributors
|
||||
#include <gtest/gtest.h>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <xgboost/data.h>
|
||||
#include "../../../src/data/adapter.h"
|
||||
#include "../../../src/data/simple_dmatrix.h"
|
||||
#include "../../../src/common/timer.h"
|
||||
#include "../helpers.h"
|
||||
using namespace xgboost; // NOLINT
|
||||
TEST(adapter, CSRAdapter) {
|
||||
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/c_api.h"
|
||||
|
||||
namespace xgboost {
|
||||
TEST(Adapter, CSRAdapter) {
|
||||
int n = 2;
|
||||
std::vector<float> data = {1, 2, 3, 4, 5};
|
||||
std::vector<unsigned> feature_idx = {0, 1, 0, 1, 1};
|
||||
std::vector<size_t> row_ptr = {0, 2, 4, 5};
|
||||
data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(),
|
||||
row_ptr.size() - 1, data.size(), n);
|
||||
row_ptr.size() - 1, data.size(), n);
|
||||
adapter.Next();
|
||||
auto & batch = adapter.Value();
|
||||
auto line0 = batch.GetLine(0);
|
||||
@@ -28,7 +34,7 @@ TEST(adapter, CSRAdapter) {
|
||||
EXPECT_EQ(line2 .GetElement(0).column_idx, 1);
|
||||
}
|
||||
|
||||
TEST(adapter, CSCAdapterColsMoreThanRows) {
|
||||
TEST(Adapter, CSCAdapterColsMoreThanRows) {
|
||||
std::vector<float> data = {1, 2, 3, 4, 5, 6, 7, 8};
|
||||
std::vector<unsigned> row_idx = {0, 1, 0, 1, 0, 1, 0, 1};
|
||||
std::vector<size_t> col_ptr = {0, 2, 4, 6, 8};
|
||||
@@ -88,3 +94,67 @@ TEST(c_api, DMatrixSliceAdapterFromSimpleDMatrix) {
|
||||
|
||||
delete pp_dmat;
|
||||
}
|
||||
|
||||
// A mock for JVM data iterator.
|
||||
class DataIterForTest {
|
||||
std::vector<float> data_ {1, 2, 3, 4, 5};
|
||||
std::vector<std::remove_pointer<decltype(std::declval<XGBoostBatchCSR>().index)>::type>
|
||||
feature_idx_ {0, 1, 0, 1, 1};
|
||||
std::vector<std::remove_pointer<decltype(std::declval<XGBoostBatchCSR>().offset)>::type>
|
||||
row_ptr_ {0, 2, 4, 5};
|
||||
size_t iter_ {0};
|
||||
|
||||
public:
|
||||
size_t static constexpr kCols { 13 }; // Test for having some missing columns
|
||||
|
||||
XGBoostBatchCSR Next() {
|
||||
for (auto& v : data_) {
|
||||
v += iter_;
|
||||
}
|
||||
XGBoostBatchCSR batch;
|
||||
batch.columns = 2;
|
||||
batch.offset = dmlc::BeginPtr(row_ptr_);
|
||||
batch.index = dmlc::BeginPtr(feature_idx_);
|
||||
batch.value = dmlc::BeginPtr(data_);
|
||||
batch.size = 3;
|
||||
|
||||
batch.label = nullptr;
|
||||
batch.weight = nullptr;
|
||||
|
||||
iter_++;
|
||||
|
||||
return batch;
|
||||
}
|
||||
size_t Iter() const { return iter_; }
|
||||
};
|
||||
|
||||
size_t constexpr DataIterForTest::kCols;
|
||||
|
||||
/*!
 * \brief Callback that feeds batches from a DataIterForTest to the consumer.
 *
 * \param data_handle          Opaque handle; cast back to DataIterForTest*.
 * \param set_function         Invoked with the freshly produced batch.
 * \param set_function_handle  Passed through unchanged to set_function.
 *
 * \return 1 after a batch was delivered, 0 once two batches have been produced.
 */
int SetDataNextForTest(DataIterHandle data_handle,
                       XGBCallbackSetData *set_function,
                       DataHolderHandle set_function_handle) {
  size_t constexpr kMaxBatches { 2 };
  auto *source = static_cast<DataIterForTest *>(data_handle);

  if (source->Iter() >= kMaxBatches) {
    return 0;  // stopping condition
  }

  XGBoostBatchCSR batch = source->Next();
  // Override the column count reported by the batch with the test-wide value.
  batch.columns = DataIterForTest::kCols;
  set_function(set_function_handle, batch);
  return 1;
}
|
||||
|
||||
// Builds a DMatrix through the iterator adapter and checks the resulting shape.
TEST(Adapter, IteratorAdaper) {
  // SetDataNextForTest delivers 2 batches and each batch carries 3 rows.
  constexpr size_t kRows { 6 };
  float const missing = std::numeric_limits<float>::quiet_NaN();

  DataIterForTest iter;
  data::IteratorAdapter adapter{&iter, SetDataNextForTest};

  std::unique_ptr<DMatrix> data { DMatrix::Create(&adapter, missing, 1) };

  auto const& info = data->Info();
  ASSERT_EQ(info.num_col_, DataIterForTest::kCols);
  ASSERT_EQ(info.num_row_, kRows);
}
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
|
||||
#include "../../../src/data/adapter.h"
|
||||
#include "../helpers.h"
|
||||
#include "xgboost/base.h"
|
||||
|
||||
using namespace xgboost; // NOLINT
|
||||
|
||||
@@ -185,10 +186,8 @@ TEST(SimpleDMatrix, FromFile) {
|
||||
CreateBigTestData(filename, 3 * 5);
|
||||
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
|
||||
dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1, "auto"));
|
||||
data::FileAdapter adapter(parser.get());
|
||||
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
|
||||
1);
|
||||
for (auto &batch : dmat.GetBatches<SparsePage>()) {
|
||||
|
||||
auto verify_batch = [](SparsePage const &batch) {
|
||||
EXPECT_EQ(batch.Size(), 5);
|
||||
EXPECT_EQ(batch.offset.HostVector(),
|
||||
std::vector<bst_row_t>({0, 3, 6, 9, 12, 15}));
|
||||
@@ -205,6 +204,16 @@ TEST(SimpleDMatrix, FromFile) {
|
||||
EXPECT_EQ(batch[i][2].index, 4);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
constexpr bst_feature_t kCols = 5;
|
||||
data::FileAdapter adapter(parser.get());
|
||||
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
|
||||
1);
|
||||
ASSERT_EQ(dmat.Info().num_col_, kCols);
|
||||
|
||||
for (auto &batch : dmat.GetBatches<SparsePage>()) {
|
||||
verify_batch(batch);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -263,8 +263,10 @@ TEST(SparsePageDMatrix, FromFile) {
|
||||
data::FileAdapter adapter(parser.get());
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
|
||||
data::SparsePageDMatrix dmat(
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), -1, tmp_file, 1);
|
||||
ASSERT_EQ(dmat.Info().num_col_, 5);
|
||||
|
||||
for (auto &batch : dmat.GetBatches<SparsePage>()) {
|
||||
std::vector<bst_row_t> expected_offset(batch.Size() + 1);
|
||||
|
||||
Reference in New Issue
Block a user