Add data split mode to DMatrix MetaInfo (#8568)
This commit is contained in:
@@ -185,6 +185,17 @@ TEST(CAPI, CatchDMLCError) {
|
||||
EXPECT_THROW({ dmlc::Stream::Create("foo", "r"); }, dmlc::Error);
|
||||
}
|
||||
|
||||
// Checks error propagation through the URI-based DMatrix constructor of the C API:
// XGDMatrixCreateFromURI is expected to report failure (-1) for the URI "foo"
// (presumably because no such file exists in the test environment -- confirm),
// and opening the same path directly through dmlc is expected to throw dmlc::Error.
TEST(CAPI, CatchDMLCErrorURI) {
  // Build the JSON configuration consumed by XGDMatrixCreateFromURI.
  Json uri_config{Object()};
  uri_config["uri"] = String{"foo"};
  uri_config["silent"] = Integer{0};

  // Serialize the configuration into the string form passed to the C API.
  std::string serialized_config;
  Json::Dump(uri_config, &serialized_config);

  // The C API call must return -1 rather than succeed.
  DMatrixHandle handle;
  ASSERT_EQ(XGDMatrixCreateFromURI(serialized_config.c_str(), &handle), -1);

  // The underlying dmlc stream creation throws for the same path.
  EXPECT_THROW({ dmlc::Stream::Create("foo", "r"); }, dmlc::Error);
}
|
||||
|
||||
TEST(CAPI, DMatrixSetFeatureName) {
|
||||
size_t constexpr kRows = 10;
|
||||
bst_feature_t constexpr kCols = 2;
|
||||
|
||||
@@ -88,8 +88,7 @@ inline std::shared_ptr<DMatrix> GetExternalMemoryDMatrixFromData(
|
||||
fo << row_data.str() << "\n";
|
||||
}
|
||||
fo.close();
|
||||
return std::shared_ptr<DMatrix>(DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, DataSplitMode::kNone, "auto"));
|
||||
return std::shared_ptr<DMatrix>(DMatrix::Load(tmp_file + "#" + tmp_file + ".cache"));
|
||||
}
|
||||
|
||||
// Test that elements are approximately equally distributed among bins
|
||||
|
||||
@@ -27,7 +27,6 @@ std::string GetModelStr() {
|
||||
"train_parameter": {
|
||||
"debug_verbose": "0",
|
||||
"disable_default_eval_metric": "0",
|
||||
"dsplit": "auto",
|
||||
"nthread": "0",
|
||||
"seed": "0",
|
||||
"seed_per_iteration": "0",
|
||||
|
||||
@@ -143,7 +143,7 @@ TEST(DMatrix, Uri) {
|
||||
// EXPECT_THROW(dmat.reset(DMatrix::Load(path, false, true)), dmlc::Error);
|
||||
|
||||
std::string uri = path + "?format=csv";
|
||||
dmat.reset(DMatrix::Load(uri, false, DataSplitMode::kRow));
|
||||
dmat.reset(DMatrix::Load(uri, false));
|
||||
|
||||
ASSERT_EQ(dmat->Info().num_col_, kCols);
|
||||
ASSERT_EQ(dmat->Info().num_row_, kRows);
|
||||
|
||||
@@ -175,7 +175,7 @@ TEST(MetaInfo, LoadQid) {
|
||||
os.set_stream(nullptr);
|
||||
}
|
||||
std::unique_ptr<xgboost::DMatrix> dmat(
|
||||
xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kNone, "libsvm"));
|
||||
xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kRow, "libsvm"));
|
||||
|
||||
const xgboost::MetaInfo& info = dmat->Info();
|
||||
const std::vector<xgboost::bst_uint> expected_group_ptr{0, 4, 8, 12};
|
||||
|
||||
@@ -15,13 +15,14 @@ TEST(SimpleDMatrix, MetaInfo) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kNone);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
|
||||
|
||||
// Test the metadata that was parsed
|
||||
EXPECT_EQ(dmat->Info().num_row_, 2);
|
||||
EXPECT_EQ(dmat->Info().num_col_, 5);
|
||||
EXPECT_EQ(dmat->Info().num_nonzero_, 6);
|
||||
EXPECT_EQ(dmat->Info().labels.Size(), dmat->Info().num_row_);
|
||||
EXPECT_EQ(dmat->Info().data_split_mode, DataSplitMode::kRow);
|
||||
|
||||
delete dmat;
|
||||
}
|
||||
@@ -30,7 +31,7 @@ TEST(SimpleDMatrix, RowAccess) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, false, xgboost::DataSplitMode::kNone);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, false);
|
||||
|
||||
// Loop over the batches and count the records
|
||||
int64_t row_count = 0;
|
||||
@@ -53,7 +54,7 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kNone);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
|
||||
|
||||
ASSERT_TRUE(dmat->SingleColBlock());
|
||||
|
||||
@@ -360,6 +361,7 @@ TEST(SimpleDMatrix, SliceCol) {
|
||||
ASSERT_EQ(out->Info().num_col_, out->Info().num_col_);
|
||||
ASSERT_EQ(out->Info().num_row_, kRows);
|
||||
ASSERT_EQ(out->Info().num_nonzero_, kRows * kSlicCols); // dense
|
||||
ASSERT_EQ(out->Info().data_split_mode, DataSplitMode::kCol);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -367,12 +369,12 @@ TEST(SimpleDMatrix, SaveLoadBinary) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kNone);
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file);
|
||||
data::SimpleDMatrix *simple_dmat = dynamic_cast<data::SimpleDMatrix*>(dmat);
|
||||
|
||||
const std::string tmp_binfile = tempdir.path + "/csr_source.binary";
|
||||
simple_dmat->SaveToLocalFile(tmp_binfile);
|
||||
xgboost::DMatrix * dmat_read = xgboost::DMatrix::Load(tmp_binfile, true, xgboost::DataSplitMode::kNone);
|
||||
xgboost::DMatrix * dmat_read = xgboost::DMatrix::Load(tmp_binfile);
|
||||
|
||||
EXPECT_EQ(dmat->Info().num_col_, dmat_read->Info().num_col_);
|
||||
EXPECT_EQ(dmat->Info().num_row_, dmat_read->Info().num_row_);
|
||||
|
||||
@@ -108,8 +108,7 @@ TEST(SparsePageDMatrix, MetaInfo) {
|
||||
size_t constexpr kEntries = 24;
|
||||
CreateBigTestData(tmp_file, kEntries);
|
||||
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", false, xgboost::DataSplitMode::kNone);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", false);
|
||||
|
||||
// Test the metadata that was parsed
|
||||
EXPECT_EQ(dmat->Info().num_row_, 8ul);
|
||||
@@ -136,8 +135,7 @@ TEST(SparsePageDMatrix, ColAccess) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
xgboost::DMatrix *dmat =
|
||||
xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, xgboost::DataSplitMode::kNone);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
|
||||
|
||||
// Loop over the batches and assert the data is as expected
|
||||
size_t iter = 0;
|
||||
|
||||
@@ -12,7 +12,7 @@ TEST(SparsePageDMatrix, EllpackPage) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/simple.libsvm";
|
||||
CreateSimpleTestData(tmp_file);
|
||||
DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, DataSplitMode::kNone);
|
||||
DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
|
||||
|
||||
// Loop over the batches and assert the data is as expected
|
||||
size_t n = 0;
|
||||
|
||||
@@ -527,8 +527,7 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
|
||||
if (page_size > 0) {
|
||||
uri += "#" + tmp_file + ".cache";
|
||||
}
|
||||
std::unique_ptr<DMatrix> dmat(
|
||||
DMatrix::Load(uri, true, DataSplitMode::kNone, "auto"));
|
||||
std::unique_ptr<DMatrix> dmat(DMatrix::Load(uri));
|
||||
return dmat;
|
||||
}
|
||||
|
||||
|
||||
@@ -98,8 +98,7 @@ TEST(Learner, SLOW_CheckMultiBatch) { // NOLINT
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/big.libsvm";
|
||||
CreateBigTestData(tmp_file, 50000);
|
||||
std::shared_ptr<DMatrix> dmat(xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, DataSplitMode::kNone, "auto"));
|
||||
std::shared_ptr<DMatrix> dmat(xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache"));
|
||||
EXPECT_FALSE(dmat->SingleColBlock());
|
||||
size_t num_row = dmat->Info().num_row_;
|
||||
std::vector<bst_float> labels(num_row);
|
||||
|
||||
Reference in New Issue
Block a user