[Breaking] Require format to be specified in input URI. (#9077)
Previously, we used `libsvm` as the default when the format was not specified. However, the dmlc data parser is not particularly robust against errors, and the most common type of error is an undefined format. In addition, we now recommend that users use other data loaders instead. We will continue to maintain the parsers, as they are currently used for many internal tests, including federated learning.
This commit is contained in:
@@ -29,16 +29,16 @@ TEST(FileIterator, Basic) {
|
||||
{
|
||||
auto zpath = tmpdir.path + "/0-based.svm";
|
||||
CreateBigTestData(zpath, 3 * 64, true);
|
||||
zpath += "?indexing_mode=0";
|
||||
FileIterator iter{zpath, 0, 1, "libsvm"};
|
||||
zpath += "?indexing_mode=0&format=libsvm";
|
||||
FileIterator iter{zpath, 0, 1};
|
||||
check_n_features(&iter);
|
||||
}
|
||||
|
||||
{
|
||||
auto opath = tmpdir.path + "/1-based.svm";
|
||||
CreateBigTestData(opath, 3 * 64, false);
|
||||
opath += "?indexing_mode=1";
|
||||
FileIterator iter{opath, 0, 1, "libsvm"};
|
||||
opath += "?indexing_mode=1&format=libsvm";
|
||||
FileIterator iter{opath, 0, 1};
|
||||
check_n_features(&iter);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,8 +157,7 @@ TEST(MetaInfo, LoadQid) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
std::string tmp_file = tempdir.path + "/qid_test.libsvm";
|
||||
{
|
||||
std::unique_ptr<dmlc::Stream> fs(
|
||||
dmlc::Stream::Create(tmp_file.c_str(), "w"));
|
||||
std::unique_ptr<dmlc::Stream> fs(dmlc::Stream::Create(tmp_file.c_str(), "w"));
|
||||
dmlc::ostream os(fs.get());
|
||||
os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
|
||||
2 qid:1 1:0 2:0 3:1 4:0.1 5:1
|
||||
@@ -175,7 +174,7 @@ TEST(MetaInfo, LoadQid) {
|
||||
os.set_stream(nullptr);
|
||||
}
|
||||
std::unique_ptr<xgboost::DMatrix> dmat(
|
||||
xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kRow, "libsvm"));
|
||||
xgboost::DMatrix::Load(tmp_file + "?format=libsvm", true, xgboost::DataSplitMode::kRow));
|
||||
|
||||
const xgboost::MetaInfo& info = dmat->Info();
|
||||
const std::vector<xgboost::bst_uint> expected_group_ptr{0, 4, 8, 12};
|
||||
|
||||
@@ -17,11 +17,15 @@
|
||||
|
||||
using namespace xgboost; // NOLINT
|
||||
|
||||
namespace {
|
||||
// Returns `name` with the "?format=libsvm" query appended, so the input URI
// states its data format explicitly.
std::string UriSVM(std::string name) {
  name += "?format=libsvm";
  return name;
}
|
||||
} // namespace
|
||||
|
||||
TEST(SimpleDMatrix, MetaInfo) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
|
||||
|
||||
// Test the metadata that was parsed
|
||||
EXPECT_EQ(dmat->Info().num_row_, 2);
|
||||
@@ -37,7 +41,7 @@ TEST(SimpleDMatrix, RowAccess) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, false);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file), false);
|
||||
|
||||
// Loop over the batches and count the records
|
||||
int64_t row_count = 0;
|
||||
@@ -60,7 +64,7 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
|
||||
|
||||
ASSERT_TRUE(dmat->SingleColBlock());
|
||||
|
||||
@@ -387,7 +391,7 @@ TEST(SimpleDMatrix, SaveLoadBinary) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file);
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
|
||||
data::SimpleDMatrix *simple_dmat = dynamic_cast<data::SimpleDMatrix*>(dmat);
|
||||
|
||||
const std::string tmp_binfile = tempdir.path + "/csr_source.binary";
|
||||
|
||||
@@ -16,14 +16,19 @@
|
||||
#include "../helpers.h"
|
||||
|
||||
using namespace xgboost; // NOLINT
|
||||
namespace {
|
||||
// Builds an input URI from `name` by appending "?format=libsvm", then "#",
// then `cache` followed by ".cache".
// NOTE(review): the "#<cache>.cache" suffix presumably selects the on-disk
// cache prefix used by the external-memory DMatrix -- confirm against
// DMatrix::Load.
std::string UriSVM(std::string name, std::string cache) {
|
||||
return name + "?format=libsvm" + "#" + cache + ".cache";
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <typename Page>
|
||||
void TestSparseDMatrixLoadFile() {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
auto opath = tmpdir.path + "/1-based.svm";
|
||||
CreateBigTestData(opath, 3 * 64, false);
|
||||
opath += "?indexing_mode=1";
|
||||
data::FileIterator iter{opath, 0, 1, "libsvm"};
|
||||
opath += "?indexing_mode=1&format=libsvm";
|
||||
data::FileIterator iter{opath, 0, 1};
|
||||
auto n_threads = 0;
|
||||
data::SparsePageDMatrix m{&iter,
|
||||
iter.Proxy(),
|
||||
@@ -112,15 +117,13 @@ TEST(SparsePageDMatrix, MetaInfo) {
|
||||
size_t constexpr kEntries = 24;
|
||||
CreateBigTestData(tmp_file, kEntries);
|
||||
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", false);
|
||||
std::unique_ptr<DMatrix> dmat{xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file), false)};
|
||||
|
||||
// Test the metadata that was parsed
|
||||
EXPECT_EQ(dmat->Info().num_row_, 8ul);
|
||||
EXPECT_EQ(dmat->Info().num_col_, 5ul);
|
||||
EXPECT_EQ(dmat->Info().num_nonzero_, kEntries);
|
||||
EXPECT_EQ(dmat->Info().labels.Size(), dmat->Info().num_row_);
|
||||
|
||||
delete dmat;
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, RowAccess) {
|
||||
@@ -139,7 +142,7 @@ TEST(SparsePageDMatrix, ColAccess) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file));
|
||||
|
||||
// Loop over the batches and assert the data is as expected
|
||||
size_t iter = 0;
|
||||
@@ -231,7 +234,7 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) {
|
||||
std::string filename = tempdir.path + "/simple.libsvm";
|
||||
CreateBigTestData(filename, 1 << 16);
|
||||
|
||||
data::FileIterator iter(filename, 0, 1, "auto");
|
||||
data::FileIterator iter(filename + "?format=libsvm", 0, 1);
|
||||
std::unique_ptr<DMatrix> sparse{
|
||||
new data::SparsePageDMatrix{&iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
|
||||
std::numeric_limits<float>::quiet_NaN(), threads, filename}};
|
||||
|
||||
@@ -13,7 +13,7 @@ TEST(SparsePageDMatrix, EllpackPage) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
|
||||
DMatrix* dmat = DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache");
|
||||
|
||||
// Loop over the batches and assert the data is as expected
|
||||
size_t n = 0;
|
||||
|
||||
Reference in New Issue
Block a user