xgboost/tests/cpp/data/test_metainfo.cc
liuliang01 0cf88d036f Add qid like ranklib format (#2749)
* add qid for https://github.com/dmlc/xgboost/issues/2748

* change names

* change spaces

* change qid to bst_uint type

* change qid type to size_t

* change qid first to SIZE_MAX

* change qid type from size_t to uint64_t

* update dmlc-core

* fix qids name error

* fix group_ptr_ error

* Style fix

* Add qid handling logic to SparsePage

* New MetaInfo format + backward compatibility fix

Old MetaInfo format (1.0) doesn't contain qid field. We still want to be able
to read from MetaInfo files saved in old format. Also, define a new format
(2.0) that contains the qid field. This way, we can distinguish files that
contain qid and those that do not.

* Update MetaInfo test

* Simplify group assignment logic

* Explicitly set qid=nullptr in NativeDataIter

NativeDataIter's callback does not support qid field. Users of NativeDataIter
will need to call setGroup() function separately to set group information.

* Save qids_ in SaveBinary()

* Upgrade dmlc-core submodule

* Add a test for reading qid

* Add contributor

* Check the size of qids_

* Document qid format
2018-06-30 20:24:03 +00:00

126 lines
4.2 KiB
C++

// Copyright by Contributors
#include <dmlc/io.h>
#include <xgboost/data.h>
#include <string>
#include <memory>
#include "../../../src/data/simple_csr_source.h"
#include "../helpers.h"
// Exercises MetaInfo::SetInfo for the "root_index", "label", "weight",
// "base_margin" and "group" fields, checking the documented defaults before
// each field is set and the stored result afterwards, then verifies Clear().
TEST(MetaInfo, GetSet) {
  xgboost::MetaInfo info;

  double double2[2] = {1.0, 2.0};
  EXPECT_EQ(info.GetRoot(1), 0)
    << "When no root_index is given, was expecting default value 0";
  info.SetInfo("root_index", double2, xgboost::kDouble, 2);
  EXPECT_EQ(info.GetRoot(1), 2.0f);

  EXPECT_EQ(info.labels_.size(), 0);
  // The dtype tag must describe the element type of the buffer being passed.
  // double2 holds doubles, so it is tagged kDouble (not kFloat32); otherwise
  // the bytes would be reinterpreted and the stored labels would be garbage.
  info.SetInfo("label", double2, xgboost::kDouble, 2);
  // ASSERT (not EXPECT) so the element access below is never out of range.
  ASSERT_EQ(info.labels_.size(), 2);
  EXPECT_EQ(info.labels_[1], 2.0f);

  float float2[2] = {1.0f, 2.0f};
  EXPECT_EQ(info.GetWeight(1), 1.0f)
    << "When no weights are given, was expecting default value 1";
  info.SetInfo("weight", float2, xgboost::kFloat32, 2);
  EXPECT_EQ(info.GetWeight(1), 2.0f);

  uint32_t uint32_t2[2] = {1U, 2U};
  EXPECT_EQ(info.base_margin_.size(), 0);
  info.SetInfo("base_margin", uint32_t2, xgboost::kUInt32, 2);
  EXPECT_EQ(info.base_margin_.size(), 2);

  uint64_t uint64_t2[2] = {1U, 2U};
  EXPECT_EQ(info.group_ptr_.size(), 0);
  info.SetInfo("group", uint64_t2, xgboost::kUInt64, 2);
  // Two group sizes {1, 2} are expected to yield three boundary entries, the
  // last being the running total 1 + 2 = 3.
  ASSERT_EQ(info.group_ptr_.size(), 3);
  EXPECT_EQ(info.group_ptr_[2], 3);

  info.Clear();
  ASSERT_EQ(info.group_ptr_.size(), 0);
}
// Round-trips a MetaInfo through SaveBinary/LoadBinary via a temporary file
// and checks that labels, row count and column count survive unchanged.
TEST(MetaInfo, SaveLoadBinary) {
  xgboost::MetaInfo info;
  double vals[2] = {1.0, 2.0};
  info.SetInfo("label", vals, xgboost::kDouble, 2);
  info.num_row_ = 2;
  info.num_col_ = 1;

  const std::string tmp_file = TempFileName();
  {
    // Scoped owner: the write stream is closed at the end of this block, so
    // the file is complete on disk before its size is measured.
    std::unique_ptr<dmlc::Stream> fs(
        dmlc::Stream::Create(tmp_file.c_str(), "w"));
    info.SaveBinary(fs.get());
  }
  ASSERT_EQ(GetFileSize(tmp_file), 84)
    << "Expected saved binary file size to be same as object size";

  xgboost::MetaInfo inforead;
  {
    // Scoped owner for the read stream as well: previously this stream was
    // never deleted, leaking it and leaving the file open while it was being
    // removed below.
    std::unique_ptr<dmlc::Stream> fs(
        dmlc::Stream::Create(tmp_file.c_str(), "r"));
    inforead.LoadBinary(fs.get());
  }
  EXPECT_EQ(inforead.labels_, info.labels_);
  EXPECT_EQ(inforead.num_col_, info.num_col_);
  EXPECT_EQ(inforead.num_row_, info.num_row_);

  std::remove(tmp_file.c_str());
}
// Writes a small LibSVM-format file whose rows carry `qid:` annotations, loads
// it as a DMatrix, and checks the query ids, the group boundaries derived from
// them, and the sparse feature data of the single resulting batch.
//
// GoogleTest assertions are used (rather than dmlc CHECK macros) so failures
// are reported like those of the other tests in this file.
TEST(MetaInfo, LoadQid) {
  std::string tmp_file = TempFileName();
  {
    std::unique_ptr<dmlc::Stream> fs(
        dmlc::Stream::Create(tmp_file.c_str(), "w"));
    dmlc::ostream os(fs.get());
    // 12 rows: label, qid, then five index:value features. Three query ids
    // (1, 2, 3) with four consecutive rows each.
    os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
2 qid:1 1:0 2:0 3:1 4:0.1 5:1
1 qid:1 1:0 2:1 3:0 4:0.4 5:0
1 qid:1 1:0 2:0 3:1 4:0.3 5:0
1 qid:2 1:0 2:0 3:1 4:0.2 5:0
2 qid:2 1:1 2:0 3:1 4:0.4 5:0
1 qid:2 1:0 2:0 3:1 4:0.1 5:0
1 qid:2 1:0 2:0 3:1 4:0.2 5:0
2 qid:3 1:0 2:0 3:1 4:0.1 5:1
3 qid:3 1:1 2:1 3:0 4:0.3 5:0
4 qid:3 1:1 2:0 3:0 4:0.4 5:1
1 qid:3 1:0 2:1 3:1 4:0.5 5:0)qid";
    // Detach the ostream from the underlying stream before `fs` is released
    // (NOTE(review): presumably this also flushes buffered output - confirm
    // against dmlc::ostream::set_stream).
    os.set_stream(nullptr);
  }

  std::unique_ptr<xgboost::DMatrix> dmat(
      xgboost::DMatrix::Load(tmp_file, true, false, "libsvm"));
  // The file is deleted right after loading; the checks below only use `dmat`.
  std::remove(tmp_file.c_str());

  const xgboost::MetaInfo& info = dmat->Info();

  // One qid per row, and group boundaries at every change of qid.
  const std::vector<uint64_t> expected_qids{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
  const std::vector<xgboost::bst_uint> expected_group_ptr{0, 4, 8, 12};
  EXPECT_EQ(info.qids_, expected_qids);
  EXPECT_EQ(info.group_ptr_, expected_group_ptr);
  // Compared inside EXPECT_TRUE so the constants are used by value only.
  EXPECT_TRUE(info.kVersion >= info.kVersionQidAdded);

  // Every row has exactly five entries, so row offsets advance in steps of 5.
  const std::vector<size_t> expected_offset{
    0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60
  };
  // Feature entries row by row, mirroring the index:value pairs written above.
  const std::vector<xgboost::Entry> expected_data{
    {1, 1}, {2, 1}, {3, 0}, {4, 0.2}, {5, 0},
    {1, 0}, {2, 0}, {3, 1}, {4, 0.1}, {5, 1},
    {1, 0}, {2, 1}, {3, 0}, {4, 0.4}, {5, 0},
    {1, 0}, {2, 0}, {3, 1}, {4, 0.3}, {5, 0},
    {1, 0}, {2, 0}, {3, 1}, {4, 0.2}, {5, 0},
    {1, 1}, {2, 0}, {3, 1}, {4, 0.4}, {5, 0},
    {1, 0}, {2, 0}, {3, 1}, {4, 0.1}, {5, 0},
    {1, 0}, {2, 0}, {3, 1}, {4, 0.2}, {5, 0},
    {1, 0}, {2, 0}, {3, 1}, {4, 0.1}, {5, 1},
    {1, 1}, {2, 1}, {3, 0}, {4, 0.3}, {5, 0},
    {1, 1}, {2, 0}, {3, 0}, {4, 0.4}, {5, 1},
    {1, 0}, {2, 1}, {3, 1}, {4, 0.5}, {5, 0}
  };

  dmlc::DataIter<xgboost::SparsePage>* iter = dmat->RowIterator();
  iter->BeforeFirst();
  // Fatal on failure: Value() below is only meaningful after a successful Next().
  ASSERT_TRUE(iter->Next());
  const xgboost::SparsePage& batch = iter->Value();
  EXPECT_EQ(batch.base_rowid, 0);
  EXPECT_EQ(batch.offset, expected_offset);
  // EXPECT_TRUE with operator== avoids requiring a printer for xgboost::Entry.
  EXPECT_TRUE(batch.data == expected_data);
  // The whole file is expected to fit in a single batch.
  EXPECT_FALSE(iter->Next());
}