xgboost/tests/cpp/data/test_metainfo.cc
Rong Ou da6803b75b
Support column-wise data split with in-memory inputs (#9628)
---------

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
2023-10-17 12:16:39 +08:00

355 lines
14 KiB
C++

// Copyright 2016-2021 by Contributors
#include "test_metainfo.h"
#include <dmlc/io.h>
#include <xgboost/data.h>
#include <memory>
#include <string>
#include "../../../src/common/version.h"
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
#include "xgboost/base.h"
namespace xgboost {
// Exercises MetaInfo::SetInfo with one buffer per legacy dtype (double, float,
// uint32, uint64), checks the field each call fills, then checks Clear().
TEST(MetaInfo, GetSet) {
  xgboost::Context ctx;
  xgboost::MetaInfo info;

  // Labels come from a double buffer, so the dtype tag must be kDouble.
  // Tagging the same buffer as kFloat32 would have its bytes read as floats;
  // that went unnoticed before because only the size was asserted.
  double double2[2] = {1.0, 2.0};
  EXPECT_EQ(info.labels.Size(), 0);
  info.SetInfo(ctx, "label", double2, xgboost::DataType::kDouble, 2);
  ASSERT_EQ(info.labels.Size(), 2);
  auto const& h_labels = info.labels.Data()->HostVector();
  EXPECT_EQ(h_labels[0], 1.0f);
  EXPECT_EQ(h_labels[1], 2.0f);

  // Weights from a float buffer; before any weights are set GetWeight reports 1.
  float float2[2] = {1.0f, 2.0f};
  EXPECT_EQ(info.GetWeight(1), 1.0f)
      << "When no weights are given, was expecting default value 1";
  info.SetInfo(ctx, "weight", float2, xgboost::DataType::kFloat32, 2);
  EXPECT_EQ(info.GetWeight(1), 2.0f);

  // Base margin from an unsigned 32-bit buffer.
  uint32_t uint32_t2[2] = {1U, 2U};
  EXPECT_EQ(info.base_margin_.Size(), 0);
  info.SetInfo(ctx, "base_margin", uint32_t2, xgboost::DataType::kUInt32, 2);
  EXPECT_EQ(info.base_margin_.Size(), 2);

  // Group sizes {1, 2} from an unsigned 64-bit buffer: the resulting pointer
  // array has one more entry than there are groups and ends at 1 + 2 = 3.
  uint64_t uint64_t2[2] = {1U, 2U};
  EXPECT_EQ(info.group_ptr_.size(), 0);
  info.SetInfo(ctx, "group", uint64_t2, xgboost::DataType::kUInt64, 2);
  ASSERT_EQ(info.group_ptr_.size(), 3);
  EXPECT_EQ(info.group_ptr_[2], 3);

  // Clear() must drop the group structure again.
  info.Clear();
  ASSERT_EQ(info.group_ptr_.size(), 0);
}
// Checks MetaInfo::SetFeatureInfo: unknown keys are rejected, empty input is
// accepted for the known keys, the number of entries must match num_col_, and
// passing an empty list clears previously stored feature types.
TEST(MetaInfo, GetSetFeature) {
  xgboost::MetaInfo info;

  // Only "feature_name" and "feature_type" are accepted here.
  EXPECT_THROW(info.SetFeatureInfo("", nullptr, 0), dmlc::Error);
  EXPECT_THROW(info.SetFeatureInfo("foo", nullptr, 0), dmlc::Error);
  EXPECT_NO_THROW(info.SetFeatureInfo("feature_name", nullptr, 0));
  EXPECT_NO_THROW(info.SetFeatureInfo("feature_type", nullptr, 0));
  ASSERT_EQ(info.feature_type_names.size(), 0);
  ASSERT_EQ(info.feature_types.Size(), 0);
  ASSERT_EQ(info.feature_names.size(), 0);

  size_t constexpr kCols = 19;
  std::vector<std::string> type_strs(kCols, u8"float");
  // SetFeatureInfo takes an array of C strings; the pointers stay valid as
  // long as `type_strs` is alive and unmodified.
  std::vector<char const*> type_ptrs;
  type_ptrs.reserve(kCols);
  for (auto const& type_str : type_strs) {
    type_ptrs.push_back(type_str.c_str());
  }

  // 19 types for a single column: rejected.
  info.num_col_ = 1;
  EXPECT_THROW(
      info.SetFeatureInfo(u8"feature_type", type_ptrs.data(), type_ptrs.size()),
      dmlc::Error);

  // 19 types for 19 columns: accepted.
  info.num_col_ = kCols;
  EXPECT_NO_THROW(
      info.SetFeatureInfo(u8"feature_type", type_ptrs.data(), type_ptrs.size()));

  // Test clear.
  info.SetFeatureInfo("feature_type", nullptr, 0);
  ASSERT_EQ(info.feature_type_names.size(), 0);
  ASSERT_EQ(info.feature_types.Size(), 0);
  // Other conditions are tested in `SaveLoadBinary`.
}
namespace {
void VerifyGetSetFeatureColumnSplit() {
xgboost::MetaInfo info;
info.data_split_mode = DataSplitMode::kCol;
auto const world_size = collective::GetWorldSize();
auto constexpr kCols{2};
std::vector<std::string> types{u8"float", u8"c"};
std::vector<char const *> c_types(kCols);
std::transform(types.cbegin(), types.cend(), c_types.begin(),
[](auto const &str) { return str.c_str(); });
info.num_col_ = kCols;
EXPECT_THROW(info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()), dmlc::Error);
info.num_col_ = kCols * world_size;
EXPECT_NO_THROW(info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()));
std::vector<std::string> expected_type_names{u8"float", u8"c", u8"float",
u8"c", u8"float", u8"c"};
EXPECT_EQ(info.feature_type_names, expected_type_names);
std::vector<xgboost::FeatureType> expected_types{
xgboost::FeatureType::kNumerical, xgboost::FeatureType::kCategorical,
xgboost::FeatureType::kNumerical, xgboost::FeatureType::kCategorical,
xgboost::FeatureType::kNumerical, xgboost::FeatureType::kCategorical};
EXPECT_EQ(info.feature_types.HostVector(), expected_types);
std::vector<std::string> names{u8"feature0", u8"feature1"};
std::vector<char const *> c_names(kCols);
std::transform(names.cbegin(), names.cend(), c_names.begin(),
[](auto const &str) { return str.c_str(); });
info.num_col_ = kCols;
EXPECT_THROW(info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size()), dmlc::Error);
info.num_col_ = kCols * world_size;
EXPECT_NO_THROW(info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size()));
std::vector<std::string> expected_names{u8"0.feature0", u8"0.feature1", u8"1.feature0",
u8"1.feature1", u8"2.feature0", u8"2.feature1"};
EXPECT_EQ(info.feature_names, expected_names);
}
} // anonymous namespace
// Runs VerifyGetSetFeatureColumnSplit on three in-memory workers; the helper's
// expected values are written for exactly that many.
TEST(MetaInfo, GetSetFeatureColumnSplit) {
  constexpr int kNumWorkers = 3;
  RunWithInMemoryCommunicator(kNumWorkers, VerifyGetSetFeatureColumnSplit);
}
// Round-trips a populated MetaInfo through SaveBinary/LoadBinary on a
// temporary file and checks that every field written is read back unchanged.
TEST(MetaInfo, SaveLoadBinary) {
  xgboost::MetaInfo info;
  xgboost::Context ctx;

  uint64_t constexpr kRows { 64 }, kCols { 32 };
  // Produces 0, 1, 2, ... The counter lives in the closure rather than in a
  // function-local static, so the sequence restarts at 0 on every test run.
  auto generator = [next = 0.0f]() mutable { return next++; };
  std::vector<float> values(kRows);
  std::generate(values.begin(), values.end(), generator);

  info.SetInfo(ctx, "label", values.data(), xgboost::DataType::kFloat32, kRows);
  info.SetInfo(ctx, "weight", values.data(), xgboost::DataType::kFloat32, kRows);
  info.SetInfo(ctx, "base_margin", values.data(), xgboost::DataType::kFloat32, kRows);
  info.num_row_ = kRows;
  info.num_col_ = kCols;

  // Non-ASCII feature name, to make sure UTF-8 survives the round trip.
  auto featname = u8"特征名";

  std::vector<std::string> types(kCols, u8"float");
  std::vector<char const*> c_types(kCols);
  std::transform(types.cbegin(), types.cend(), c_types.begin(),
                 [](auto const &str) { return str.c_str(); });
  info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size());

  std::vector<std::string> names(kCols, featname);
  std::vector<char const*> c_names(kCols);
  std::transform(names.cbegin(), names.cend(), c_names.begin(),
                 [](auto const &str) { return str.c_str(); });
  info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size());

  dmlc::TemporaryDirectory tempdir;
  const std::string tmp_file = tempdir.path + "/metainfo.binary";
  {
    // Scoped so the write stream is destroyed before the file is read back.
    std::unique_ptr<dmlc::Stream> fs {
      dmlc::Stream::Create(tmp_file.c_str(), "w")
    };
    info.SaveBinary(fs.get());
  }

  {
    // Round-trip test
    std::unique_ptr<dmlc::Stream> fs {
      dmlc::Stream::Create(tmp_file.c_str(), "r")
    };
    xgboost::MetaInfo inforead;
    inforead.LoadBinary(fs.get());

    ASSERT_EQ(inforead.num_row_, kRows);
    EXPECT_EQ(inforead.num_row_, info.num_row_);
    EXPECT_EQ(inforead.num_col_, info.num_col_);
    EXPECT_EQ(inforead.num_nonzero_, info.num_nonzero_);

    ASSERT_EQ(inforead.labels.Data()->HostVector(), values);
    EXPECT_EQ(inforead.labels.Data()->HostVector(), info.labels.Data()->HostVector());
    EXPECT_EQ(inforead.group_ptr_, info.group_ptr_);
    EXPECT_EQ(inforead.weights_.HostVector(), info.weights_.HostVector());

    auto orig_margin = info.base_margin_.View(xgboost::DeviceOrd::CPU());
    auto read_margin = inforead.base_margin_.View(xgboost::DeviceOrd::CPU());
    // The three-iterator std::equal reads as many elements from the second
    // range as the first one holds, so the sizes must be known to match first.
    ASSERT_EQ(orig_margin.Size(), read_margin.Size());
    EXPECT_TRUE(std::equal(orig_margin.Values().cbegin(), orig_margin.Values().cend(),
                           read_margin.Values().cbegin()));

    EXPECT_EQ(inforead.feature_type_names.size(), kCols);
    EXPECT_EQ(inforead.feature_types.Size(), kCols);
    EXPECT_TRUE(std::all_of(inforead.feature_type_names.cbegin(),
                            inforead.feature_type_names.cend(),
                            [](auto const &str) { return str == u8"float"; }));
    auto h_ft = inforead.feature_types.HostSpan();
    EXPECT_TRUE(std::all_of(h_ft.cbegin(), h_ft.cend(), [](auto f) {
      return f == xgboost::FeatureType::kNumerical;
    }));

    EXPECT_EQ(inforead.feature_names.size(), kCols);
    EXPECT_TRUE(std::all_of(inforead.feature_names.cbegin(),
                            inforead.feature_names.cend(),
                            [=](auto const& str) {
                              return str == featname;
                            }));
  }
}
// Writes a small libsvm file with `qid:` annotations (three queries of four
// rows each, five features per row), loads it as a DMatrix and checks the
// derived group pointers as well as the row offsets and entries of each batch.
TEST(MetaInfo, LoadQid) {
  dmlc::TemporaryDirectory tempdir;
  std::string tmp_file = tempdir.path + "/qid_test.libsvm";
  {
    std::unique_ptr<dmlc::Stream> fs(dmlc::Stream::Create(tmp_file.c_str(), "w"));
    dmlc::ostream os(fs.get());
    os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
2 qid:1 1:0 2:0 3:1 4:0.1 5:1
1 qid:1 1:0 2:1 3:0 4:0.4 5:0
1 qid:1 1:0 2:0 3:1 4:0.3 5:0
1 qid:2 1:0 2:0 3:1 4:0.2 5:0
2 qid:2 1:1 2:0 3:1 4:0.4 5:0
1 qid:2 1:0 2:0 3:1 4:0.1 5:0
1 qid:2 1:0 2:0 3:1 4:0.2 5:0
2 qid:3 1:0 2:0 3:1 4:0.1 5:1
3 qid:3 1:1 2:1 3:0 4:0.3 5:0
4 qid:3 1:1 2:0 3:0 4:0.4 5:1
1 qid:3 1:0 2:1 3:1 4:0.5 5:0)qid";
    os.set_stream(nullptr);
  }
  std::unique_ptr<xgboost::DMatrix> dmat(
      xgboost::DMatrix::Load(tmp_file + "?format=libsvm", true, xgboost::DataSplitMode::kRow));

  // gtest assertions are used below (instead of dmlc CHECK macros) so that a
  // mismatch is reported as a regular test failure, like everywhere else in
  // this file.
  const xgboost::MetaInfo& info = dmat->Info();
  // Three queries with four rows each.
  const std::vector<xgboost::bst_uint> expected_group_ptr{0, 4, 8, 12};
  EXPECT_EQ(info.group_ptr_, expected_group_ptr);

  // Twelve rows with five entries each.
  const std::vector<xgboost::bst_row_t> expected_offset{
    0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60
  };
  // (feature index, value) pairs in file order, row after row.
  const std::vector<xgboost::Entry> expected_data{
      xgboost::Entry(1, 1),   xgboost::Entry(2, 1),   xgboost::Entry(3, 0),
      xgboost::Entry(4, 0.2), xgboost::Entry(5, 0),   xgboost::Entry(1, 0),
      xgboost::Entry(2, 0),   xgboost::Entry(3, 1),   xgboost::Entry(4, 0.1),
      xgboost::Entry(5, 1),   xgboost::Entry(1, 0),   xgboost::Entry(2, 1),
      xgboost::Entry(3, 0),   xgboost::Entry(4, 0.4), xgboost::Entry(5, 0),
      xgboost::Entry(1, 0),   xgboost::Entry(2, 0),   xgboost::Entry(3, 1),
      xgboost::Entry(4, 0.3), xgboost::Entry(5, 0),   xgboost::Entry(1, 0),
      xgboost::Entry(2, 0),   xgboost::Entry(3, 1),   xgboost::Entry(4, 0.2),
      xgboost::Entry(5, 0),   xgboost::Entry(1, 1),   xgboost::Entry(2, 0),
      xgboost::Entry(3, 1),   xgboost::Entry(4, 0.4), xgboost::Entry(5, 0),
      xgboost::Entry(1, 0),   xgboost::Entry(2, 0),   xgboost::Entry(3, 1),
      xgboost::Entry(4, 0.1), xgboost::Entry(5, 0),   xgboost::Entry(1, 0),
      xgboost::Entry(2, 0),   xgboost::Entry(3, 1),   xgboost::Entry(4, 0.2),
      xgboost::Entry(5, 0),   xgboost::Entry(1, 0),   xgboost::Entry(2, 0),
      xgboost::Entry(3, 1),   xgboost::Entry(4, 0.1), xgboost::Entry(5, 1),
      xgboost::Entry(1, 1),   xgboost::Entry(2, 1),   xgboost::Entry(3, 0),
      xgboost::Entry(4, 0.3), xgboost::Entry(5, 0),   xgboost::Entry(1, 1),
      xgboost::Entry(2, 0),   xgboost::Entry(3, 0),   xgboost::Entry(4, 0.4),
      xgboost::Entry(5, 1),   xgboost::Entry(1, 0),   xgboost::Entry(2, 1),
      xgboost::Entry(3, 1),   xgboost::Entry(4, 0.5), xgboost::Entry(5, 0)};

  for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
    EXPECT_EQ(batch.base_rowid, 0U);
    EXPECT_EQ(batch.offset.HostVector(), expected_offset);
    // Compared through operator== and EXPECT_TRUE so that no value printer is
    // required for xgboost::Entry.
    EXPECT_TRUE(batch.data.HostVector() == expected_data);
  }
}
// Gives every row its own query id and checks that the resulting group
// pointer array is 0, 1, ..., num_row_.
TEST(MetaInfo, CPUQid) {
  xgboost::MetaInfo info;
  xgboost::Context ctx;
  info.num_row_ = 100;

  // query_ids[i] == i: one distinct query per row.
  std::vector<uint32_t> query_ids(info.num_row_, 0);
  uint32_t next_id = 0;
  for (auto &id : query_ids) {
    id = next_id++;
  }

  info.SetInfo(ctx, "qid", query_ids.data(), xgboost::DataType::kUInt32, info.num_row_);

  ASSERT_EQ(info.group_ptr_.size(), info.num_row_ + 1);
  ASSERT_EQ(info.group_ptr_.front(), 0);
  ASSERT_EQ(info.group_ptr_.back(), info.num_row_);

  // The size was asserted above, so this visits indices 0 .. num_row_.
  size_t expected_ptr = 0;
  for (auto const ptr : info.group_ptr_) {
    ASSERT_EQ(ptr, expected_ptr);
    ++expected_ptr;
  }
}
// Checks that inconsistent meta information is rejected: a group structure
// that fails Validate(), a label array with one element more than num_row_,
// group sizes whose running sum overflows, and (CUDA builds only) a device
// ordinal mismatch plus the same overflowing groups passed from device memory.
TEST(MetaInfo, Validate) {
  Context ctx;
  xgboost::MetaInfo info;
  info.num_row_ = 10;
  info.num_nonzero_ = 12;
  info.num_col_ = 3;

  // Eleven zero-sized groups for ten rows; Validate is expected to reject it.
  std::vector<xgboost::bst_group_t> groups(11);
  info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, groups.size());
  EXPECT_THROW(info.Validate(FstCU()), dmlc::Error);

  // One label too many is rejected already when it is set.
  std::vector<float> labels(info.num_row_ + 1);
  EXPECT_THROW(
      info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, labels.size()),
      dmlc::Error);

  // Make overflow data, which can happen when users pass group structure as int
  // or float. The last size is the maximum unsigned value (-1 cast to unsigned).
  groups.assign(std::size_t{63}, xgboost::bst_group_t{1562500});
  groups.push_back(static_cast<xgboost::bst_group_t>(-1));
  EXPECT_THROW(info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, groups.size()),
               dmlc::Error);

#if defined(XGBOOST_USE_CUDA)
  // Labels are placed on the first CUDA device, validation is requested for
  // CUDA device 1.
  info.group_ptr_.clear();
  labels.resize(info.num_row_);
  info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_);
  info.labels.SetDevice(FstCU());
  EXPECT_THROW(info.Validate(DeviceOrd::CUDA(1)), dmlc::Error);

  // The overflowing group sizes again, this time handed over as an array
  // interface string describing device memory.
  xgboost::HostDeviceVector<xgboost::bst_group_t> device_groups{groups};
  device_groups.SetDevice(FstCU());
  device_groups.DevicePointer();  // pull to device
  auto const group_vec = xgboost::linalg::MakeVec(
      device_groups.ConstDevicePointer(), device_groups.Size(), xgboost::DeviceOrd::CUDA(0));
  std::string arr_interface_str{ArrayInterfaceStr(group_vec)};
  EXPECT_THROW(info.SetInfo(ctx, "group", xgboost::StringView{arr_interface_str}), dmlc::Error);
#endif  // defined(XGBOOST_USE_CUDA)
}
// Extends one host-side MetaInfo with another of the same shape and checks
// the combined row count, that the labels stay readable on the host only, and
// that the group pointers keep a constant stride across the joined data.
TEST(MetaInfo, HostExtend) {
  xgboost::MetaInfo lhs, rhs;
  xgboost::Context ctx;
  size_t const kRows = 100;

  // Both sides get kRows labels and a matching row count.
  for (auto *side : {&lhs, &rhs}) {
    side->labels.Reshape(kRows);
    side->num_row_ = kRows;
  }
  ASSERT_TRUE(lhs.labels.Data()->HostCanRead());
  ASSERT_TRUE(rhs.labels.Data()->HostCanRead());

  // kRows / kGroupSize groups of kGroupSize rows each, on both sides.
  size_t const kGroupSize = 10;
  std::vector<xgboost::bst_group_t> group_sizes(kRows / kGroupSize,
                                                static_cast<xgboost::bst_group_t>(kGroupSize));
  lhs.SetInfo(ctx, "group", group_sizes.data(), xgboost::DataType::kUInt32, group_sizes.size());
  rhs.SetInfo(ctx, "group", group_sizes.data(), xgboost::DataType::kUInt32, group_sizes.size());

  lhs.Extend(rhs, true, true);
  ASSERT_EQ(lhs.num_row_, kRows * 2);

  // Extending must not have moved the labels to a device.
  ASSERT_TRUE(lhs.labels.Data()->HostCanRead());
  ASSERT_TRUE(rhs.labels.Data()->HostCanRead());
  ASSERT_FALSE(lhs.labels.Data()->DeviceCanRead());
  ASSERT_FALSE(rhs.labels.Data()->DeviceCanRead());

  ASSERT_EQ(lhs.group_ptr_.front(), 0);
  ASSERT_EQ(lhs.group_ptr_.back(), kRows * 2);
  size_t const n_groups = kRows * 2 / kGroupSize;
  for (size_t group_idx = 0; group_idx < n_groups; ++group_idx) {
    ASSERT_EQ(lhs.group_ptr_.at(group_idx), kGroupSize * group_idx);
  }
}
// Runs TestMetaInfoStridedData with the CPU device.
TEST(MetaInfo, CPUStridedData) {
  auto const device = DeviceOrd::CPU();
  TestMetaInfoStridedData(device);
}
} // namespace xgboost