Dmatrix refactor stage 1 (#3301)
* Use sparse page as singular CSR matrix representation
* Simplify dmatrix methods
* Reduce statefulness of batch iterators
* BREAKING CHANGE: Remove prob_buffer_row parameter. Users are instead recommended to sample their dataset as a preprocessing step before using XGBoost.
This commit is contained in:
@@ -29,7 +29,7 @@ TEST(c_api, XGDMatrixCreateFromMat_omp) {
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
auto batch = iter->Value();
|
||||
for (int i = 0; i < batch.size; i++) {
|
||||
for (int i = 0; i < batch.Size(); i++) {
|
||||
auto inst = batch[i];
|
||||
for (int j = 0; i < inst.length; i++) {
|
||||
ASSERT_EQ(inst[j].fvalue, 1.5);
|
||||
|
||||
@@ -18,13 +18,13 @@ TEST(SimpleCSRSource, SaveLoadBinary) {
|
||||
EXPECT_EQ(dmat->Info().num_row_, dmat_read->Info().num_row_);
|
||||
EXPECT_EQ(dmat->Info().num_row_, dmat_read->Info().num_row_);
|
||||
|
||||
dmlc::DataIter<xgboost::RowBatch> * row_iter = dmat->RowIterator();
|
||||
dmlc::DataIter<xgboost::RowBatch> * row_iter_read = dmat_read->RowIterator();
|
||||
auto row_iter = dmat->RowIterator();
|
||||
auto row_iter_read = dmat_read->RowIterator();
|
||||
// Test the data read into the first row
|
||||
row_iter->BeforeFirst(); row_iter->Next();
|
||||
row_iter_read->BeforeFirst(); row_iter_read->Next();
|
||||
xgboost::SparseBatch::Inst first_row = row_iter->Value()[0];
|
||||
xgboost::SparseBatch::Inst first_row_read = row_iter_read->Value()[0];
|
||||
auto first_row = row_iter->Value()[0];
|
||||
auto first_row_read = row_iter_read->Value()[0];
|
||||
EXPECT_EQ(first_row.length, first_row_read.length);
|
||||
EXPECT_EQ(first_row[2].index, first_row_read[2].index);
|
||||
EXPECT_EQ(first_row[2].fvalue, first_row_read[2].fvalue);
|
||||
|
||||
@@ -18,19 +18,19 @@ TEST(SimpleDMatrix, MetaInfo) {
|
||||
|
||||
TEST(SimpleDMatrix, RowAccess) {
|
||||
std::string tmp_file = CreateSimpleTestData();
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, true, false);
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, false, false);
|
||||
std::remove(tmp_file.c_str());
|
||||
|
||||
dmlc::DataIter<xgboost::RowBatch> * row_iter = dmat->RowIterator();
|
||||
auto row_iter = dmat->RowIterator();
|
||||
// Loop over the batches and count the records
|
||||
long row_count = 0;
|
||||
row_iter->BeforeFirst();
|
||||
while (row_iter->Next()) row_count += row_iter->Value().size;
|
||||
while (row_iter->Next()) row_count += row_iter->Value().Size();
|
||||
EXPECT_EQ(row_count, dmat->Info().num_row_);
|
||||
// Test the data read into the first row
|
||||
row_iter->BeforeFirst();
|
||||
row_iter->Next();
|
||||
xgboost::SparseBatch::Inst first_row = row_iter->Value()[0];
|
||||
auto first_row = row_iter->Value()[0];
|
||||
ASSERT_EQ(first_row.length, 3);
|
||||
EXPECT_EQ(first_row[2].index, 2);
|
||||
EXPECT_EQ(first_row[2].fvalue, 20);
|
||||
@@ -45,14 +45,14 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
|
||||
// Unsorted column access
|
||||
const std::vector<bool> enable(dmat->Info().num_col_, true);
|
||||
EXPECT_EQ(dmat->HaveColAccess(false), false);
|
||||
dmat->InitColAccess(enable, 1, dmat->Info().num_row_, false);
|
||||
dmat->InitColAccess(enable, 0, 0, false); // Calling it again should not change it
|
||||
dmat->InitColAccess(dmat->Info().num_row_, false);
|
||||
dmat->InitColAccess(0, false); // Calling it again should not change it
|
||||
ASSERT_EQ(dmat->HaveColAccess(false), true);
|
||||
|
||||
// Sorted column access
|
||||
EXPECT_EQ(dmat->HaveColAccess(true), false);
|
||||
dmat->InitColAccess(enable, 1, dmat->Info().num_row_, true);
|
||||
dmat->InitColAccess(enable, 0, 0, true); // Calling it again should not change it
|
||||
dmat->InitColAccess(dmat->Info().num_row_, true);
|
||||
dmat->InitColAccess(0, true); // Calling it again should not change it
|
||||
ASSERT_EQ(dmat->HaveColAccess(true), true);
|
||||
|
||||
EXPECT_EQ(dmat->GetColSize(0), 2);
|
||||
@@ -61,84 +61,19 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
|
||||
EXPECT_EQ(dmat->GetColDensity(1), 0.5);
|
||||
ASSERT_TRUE(dmat->SingleColBlock());
|
||||
|
||||
dmlc::DataIter<xgboost::ColBatch> * col_iter = dmat->ColIterator();
|
||||
auto* col_iter = dmat->ColIterator();
|
||||
// Loop over the batches and assert the data is as expected
|
||||
long num_col_batch = 0;
|
||||
col_iter->BeforeFirst();
|
||||
while (col_iter->Next()) {
|
||||
num_col_batch += 1;
|
||||
EXPECT_EQ(col_iter->Value().size, dmat->Info().num_col_)
|
||||
EXPECT_EQ(col_iter->Value().Size(), dmat->Info().num_col_)
|
||||
<< "Expected batch size = number of cells as #batches is 1.";
|
||||
for (int i = 0; i < static_cast<int>(col_iter->Value().size); ++i) {
|
||||
for (int i = 0; i < static_cast<int>(col_iter->Value().Size()); ++i) {
|
||||
EXPECT_EQ(col_iter->Value()[i].length, dmat->GetColSize(i))
|
||||
<< "Expected length of each colbatch = colsize as #batches is 1.";
|
||||
}
|
||||
}
|
||||
EXPECT_EQ(num_col_batch, 1) << "Expected number of batches to be 1";
|
||||
col_iter = nullptr;
|
||||
|
||||
std::vector<xgboost::bst_uint> sub_feats = {4, 3};
|
||||
dmlc::DataIter<xgboost::ColBatch> * sub_col_iter = dmat->ColIterator(sub_feats);
|
||||
// Loop over the batches and assert the data is as expected
|
||||
sub_col_iter->BeforeFirst();
|
||||
while (sub_col_iter->Next()) {
|
||||
EXPECT_EQ(sub_col_iter->Value().size, sub_feats.size())
|
||||
<< "Expected size of a batch = number of cells in subset as #batches is 1.";
|
||||
}
|
||||
sub_col_iter = nullptr;
|
||||
}
|
||||
|
||||
TEST(SimpleDMatrix, ColAccessWithBatches) {
|
||||
std::string tmp_file = CreateSimpleTestData();
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, true, false);
|
||||
std::remove(tmp_file.c_str());
|
||||
|
||||
// Unsorted column access
|
||||
const std::vector<bool> enable(dmat->Info().num_col_, true);
|
||||
EXPECT_EQ(dmat->HaveColAccess(false), false);
|
||||
dmat->InitColAccess(enable, 1, 1, false);
|
||||
dmat->InitColAccess(enable, 0, 0, false); // Calling it again should not change it
|
||||
ASSERT_EQ(dmat->HaveColAccess(false), true);
|
||||
|
||||
// Sorted column access
|
||||
EXPECT_EQ(dmat->HaveColAccess(true), false);
|
||||
dmat->InitColAccess(enable, 1, 1, true); // Max 1 row per patch
|
||||
dmat->InitColAccess(enable, 0, 0, true); // Calling it again should not change it
|
||||
ASSERT_EQ(dmat->HaveColAccess(true), true);
|
||||
|
||||
EXPECT_EQ(dmat->GetColSize(0), 2);
|
||||
EXPECT_EQ(dmat->GetColSize(1), 1);
|
||||
EXPECT_EQ(dmat->GetColDensity(0), 1);
|
||||
EXPECT_EQ(dmat->GetColDensity(1), 0.5);
|
||||
ASSERT_FALSE(dmat->SingleColBlock());
|
||||
|
||||
dmlc::DataIter<xgboost::ColBatch> * col_iter = dmat->ColIterator();
|
||||
// Loop over the batches and assert the data is as expected
|
||||
long num_col_batch = 0;
|
||||
col_iter->BeforeFirst();
|
||||
while (col_iter->Next()) {
|
||||
num_col_batch += 1;
|
||||
EXPECT_EQ(col_iter->Value().size, dmat->Info().num_col_)
|
||||
<< "Expected batch size = num_cols as max_row_perbatch is 1.";
|
||||
for (int i = 0; i < static_cast<int>(col_iter->Value().size); ++i) {
|
||||
EXPECT_LE(col_iter->Value()[i].length, 1)
|
||||
<< "Expected length of each colbatch <=1 as max_row_perbatch is 1.";
|
||||
}
|
||||
}
|
||||
EXPECT_EQ(num_col_batch, dmat->Info().num_row_)
|
||||
<< "Expected num batches = num_rows as max_row_perbatch is 1";
|
||||
col_iter = nullptr;
|
||||
|
||||
// The iterator feats should ignore any numbers larger than the num_col
|
||||
std::vector<xgboost::bst_uint> sub_feats = {
|
||||
4, 3, static_cast<unsigned int>(dmat->Info().num_col_ + 1)};
|
||||
dmlc::DataIter<xgboost::ColBatch> * sub_col_iter = dmat->ColIterator(sub_feats);
|
||||
// Loop over the batches and assert the data is as expected
|
||||
sub_col_iter->BeforeFirst();
|
||||
while (sub_col_iter->Next()) {
|
||||
EXPECT_EQ(sub_col_iter->Value().size, sub_feats.size() - 1)
|
||||
<< "Expected size of a batch = number of columns in subset "
|
||||
<< "as max_row_perbatch is 1.";
|
||||
}
|
||||
sub_col_iter = nullptr;
|
||||
}
|
||||
|
||||
@@ -7,8 +7,9 @@
|
||||
TEST(SparsePageDMatrix, MetaInfo) {
|
||||
std::string tmp_file = CreateSimpleTestData();
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false);
|
||||
tmp_file + "#" + tmp_file + ".cache", false, false);
|
||||
std::remove(tmp_file.c_str());
|
||||
std::cout << tmp_file << std::endl;
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache"));
|
||||
|
||||
// Test the metadata that was parsed
|
||||
@@ -29,16 +30,16 @@ TEST(SparsePageDMatrix, RowAccess) {
|
||||
std::remove(tmp_file.c_str());
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
|
||||
|
||||
dmlc::DataIter<xgboost::RowBatch> * row_iter = dmat->RowIterator();
|
||||
auto row_iter = dmat->RowIterator();
|
||||
// Loop over the batches and count the records
|
||||
long row_count = 0;
|
||||
row_iter->BeforeFirst();
|
||||
while (row_iter->Next()) row_count += row_iter->Value().size;
|
||||
while (row_iter->Next()) row_count += row_iter->Value().Size();
|
||||
EXPECT_EQ(row_count, dmat->Info().num_row_);
|
||||
// Test the data read into the first row
|
||||
row_iter->BeforeFirst();
|
||||
row_iter->Next();
|
||||
xgboost::SparseBatch::Inst first_row = row_iter->Value()[0];
|
||||
auto first_row = row_iter->Value()[0];
|
||||
ASSERT_EQ(first_row.length, 3);
|
||||
EXPECT_EQ(first_row[2].index, 2);
|
||||
EXPECT_EQ(first_row[2].fvalue, 20);
|
||||
@@ -58,7 +59,7 @@ TEST(SparsePageDMatrix, ColAcess) {
|
||||
|
||||
EXPECT_EQ(dmat->HaveColAccess(true), false);
|
||||
const std::vector<bool> enable(dmat->Info().num_col_, true);
|
||||
dmat->InitColAccess(enable, 1, 1, true); // Max 1 row per patch
|
||||
dmat->InitColAccess(1, true); // Max 1 row per patch
|
||||
ASSERT_EQ(dmat->HaveColAccess(true), true);
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.col.page"));
|
||||
|
||||
@@ -67,31 +68,19 @@ TEST(SparsePageDMatrix, ColAcess) {
|
||||
EXPECT_EQ(dmat->GetColDensity(0), 1);
|
||||
EXPECT_EQ(dmat->GetColDensity(1), 0.5);
|
||||
|
||||
dmlc::DataIter<xgboost::ColBatch> * col_iter = dmat->ColIterator();
|
||||
auto col_iter = dmat->ColIterator();
|
||||
// Loop over the batches and assert the data is as expected
|
||||
long num_col_batch = 0;
|
||||
col_iter->BeforeFirst();
|
||||
while (col_iter->Next()) {
|
||||
num_col_batch += 1;
|
||||
EXPECT_EQ(col_iter->Value().size, dmat->Info().num_col_)
|
||||
EXPECT_EQ(col_iter->Value().Size(), dmat->Info().num_col_)
|
||||
<< "Expected batch size to be same as num_cols as max_row_perbatch is 1.";
|
||||
}
|
||||
EXPECT_EQ(num_col_batch, dmat->Info().num_row_)
|
||||
<< "Expected num batches to be same as num_rows as max_row_perbatch is 1";
|
||||
col_iter = nullptr;
|
||||
|
||||
std::vector<xgboost::bst_uint> sub_feats = {4, 3};
|
||||
dmlc::DataIter<xgboost::ColBatch> * sub_col_iter = dmat->ColIterator(sub_feats);
|
||||
// Loop over the batches and assert the data is as expected
|
||||
sub_col_iter->BeforeFirst();
|
||||
while (sub_col_iter->Next()) {
|
||||
EXPECT_EQ(sub_col_iter->Value().size, sub_feats.size())
|
||||
<< "Expected size of a batch to be same as number of columns "
|
||||
<< "as max_row_perbatch was set to 1.";
|
||||
}
|
||||
sub_col_iter = nullptr;
|
||||
|
||||
// Clean up of external memory files
|
||||
std::remove((tmp_file + ".cache").c_str());
|
||||
std::remove((tmp_file + ".cache.col.page").c_str());
|
||||
std::remove((tmp_file + ".cache.row.page").c_str());
|
||||
|
||||
@@ -3,7 +3,13 @@
|
||||
#include <random>
|
||||
|
||||
std::string TempFileName() {
|
||||
return std::tmpnam(nullptr);
|
||||
std::string tmp = std::tmpnam(nullptr);
|
||||
std::replace(tmp.begin(), tmp.end(), '\\',
|
||||
'/'); // Remove windows backslashes
|
||||
// Remove drive prefix for windows
|
||||
if (tmp.find("C:") != std::string::npos)
|
||||
tmp.erase(tmp.find("C:"), 2);
|
||||
return tmp;
|
||||
}
|
||||
|
||||
bool FileExists(const std::string name) {
|
||||
|
||||
@@ -9,7 +9,7 @@ TEST(Linear, shotgun) {
|
||||
typedef std::pair<std::string, std::string> arg;
|
||||
auto mat = CreateDMatrix(10, 10, 0);
|
||||
std::vector<bool> enabled(mat->Info().num_col_, true);
|
||||
mat->InitColAccess(enabled, 1.0f, 1 << 16, false);
|
||||
mat->InitColAccess(1 << 16, false);
|
||||
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
|
||||
xgboost::LinearUpdater::Create("shotgun"));
|
||||
updater->Init({{"eta", "1."}});
|
||||
@@ -28,7 +28,7 @@ TEST(Linear, coordinate) {
|
||||
typedef std::pair<std::string, std::string> arg;
|
||||
auto mat = CreateDMatrix(10, 10, 0);
|
||||
std::vector<bool> enabled(mat->Info().num_col_, true);
|
||||
mat->InitColAccess(enabled, 1.0f, 1 << 16, false);
|
||||
mat->InitColAccess(1 << 16, false);
|
||||
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
|
||||
xgboost::LinearUpdater::Create("coord_descent"));
|
||||
updater->Init({{"eta", "1."}});
|
||||
|
||||
@@ -33,7 +33,7 @@ TEST(cpu_predictor, Test) {
|
||||
|
||||
// Test predict instance
|
||||
auto batch = dmat->RowIterator()->Value();
|
||||
for (int i = 0; i < batch.size; i++) {
|
||||
for (int i = 0; i < batch.Size(); i++) {
|
||||
std::vector<float> instance_out_predictions;
|
||||
cpu_predictor->PredictInstance(batch[i], &instance_out_predictions, model);
|
||||
ASSERT_EQ(instance_out_predictions[0], 1.5);
|
||||
|
||||
@@ -46,7 +46,7 @@ TEST(gpu_predictor, Test) {
|
||||
}
|
||||
// Test predict instance
|
||||
auto batch = dmat->RowIterator()->Value();
|
||||
for (int i = 0; i < batch.size; i++) {
|
||||
for (int i = 0; i < batch.Size(); i++) {
|
||||
std::vector<float> gpu_instance_out_predictions;
|
||||
std::vector<float> cpu_instance_out_predictions;
|
||||
cpu_predictor->PredictInstance(batch[i], &cpu_instance_out_predictions,
|
||||
|
||||
@@ -26,10 +26,10 @@ TEST(gpu_hist_experimental, TestSparseShard) {
|
||||
TrainParam p;
|
||||
p.max_depth = 6;
|
||||
|
||||
dmlc::DataIter<RowBatch>* iter = dmat->RowIterator();
|
||||
dmlc::DataIter<SparsePage>* iter = dmat->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
CHECK(iter->Next());
|
||||
const RowBatch& batch = iter->Value();
|
||||
const SparsePage& batch = iter->Value();
|
||||
DeviceShard shard(0, 0, 0, rows, hmat.row_ptr.back(), p);
|
||||
shard.Init(hmat, batch);
|
||||
CHECK(!iter->Next());
|
||||
@@ -67,10 +67,10 @@ TEST(gpu_hist_experimental, TestDenseShard) {
|
||||
TrainParam p;
|
||||
p.max_depth = 6;
|
||||
|
||||
dmlc::DataIter<RowBatch>* iter = dmat->RowIterator();
|
||||
dmlc::DataIter<SparsePage>* iter = dmat->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
CHECK(iter->Next());
|
||||
const RowBatch& batch = iter->Value();
|
||||
const SparsePage& batch = iter->Value();
|
||||
|
||||
DeviceShard shard(0, 0, 0, rows, hmat.row_ptr.back(), p);
|
||||
shard.Init(hmat, batch);
|
||||
|
||||
@@ -16,15 +16,16 @@ if [ ${TASK} == "lint" ]; then
|
||||
cp "$file" "${file/.cu/_tmp.cc}"
|
||||
done
|
||||
|
||||
echo "Running clang tidy..."
|
||||
header_filter='(xgboost\/src|xgboost\/include)'
|
||||
for filename in $(find src -name '*.cc'); do
|
||||
clang-tidy $filename -header-filter=$header_filter -- -Iinclude -Idmlc-core/include -Irabit/include -std=c++11 >> logtidy.txt
|
||||
done
|
||||
echo "---------clang-tidy log----------"
|
||||
cat logtidy.txt
|
||||
echo "----------------------------"
|
||||
|
||||
echo "---------clang-tidy failures----------"
|
||||
# Fail only on warnings related to XGBoost source files
|
||||
(cat logtidy.txt|grep -E 'dmlc/xgboost.*warning'|grep -v dmlc-core) && exit -1
|
||||
(cat logtidy.txt|grep -E 'xgboost.*warning'|grep -v dmlc-core) && exit -1
|
||||
echo "----------------------------"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
||||
Reference in New Issue
Block a user