From 45876bf41bc48cc54e01f7fc327b066c2e060011 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sun, 30 Jun 2019 09:56:49 +0800 Subject: [PATCH] Fix external memory for get column batches. (#4622) * Fix external memory for get column batches. This fixes two bugs: * Use PushCSC for get column batches. * Don't remove the created temporary directory before finishing test. * Check all pages. --- include/xgboost/data.h | 12 +----------- src/data/data.cc | 12 ++++++++++++ src/data/simple_dmatrix.h | 2 +- src/data/sparse_page_dmatrix.h | 2 +- src/data/sparse_page_source.cc | 14 +++++++------- src/data/sparse_page_writer.cc | 2 +- tests/cpp/common/test_column_matrix.cc | 12 ++++++++---- tests/cpp/common/test_gpu_hist_util.cu | 12 ++++++++---- tests/cpp/data/test_data.cc | 8 ++++++-- tests/cpp/data/test_sparse_page_dmatrix.cc | 22 +++++++++++++++++++++- tests/cpp/helpers.cc | 17 +++++++++-------- tests/cpp/helpers.h | 3 ++- tests/cpp/predictor/test_cpu_predictor.cc | 4 +++- tests/cpp/predictor/test_gpu_predictor.cu | 14 ++++++++++---- 14 files changed, 90 insertions(+), 46 deletions(-) diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 28ff1ba2b..311347eaa 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -265,17 +265,7 @@ class SparsePage { * \brief Push one instance into page * \param inst an instance row */ - inline void Push(const Inst &inst) { - auto& data_vec = data.HostVector(); - auto& offset_vec = offset.HostVector(); - offset_vec.push_back(offset_vec.back() + inst.size()); - size_t begin = data_vec.size(); - data_vec.resize(begin + inst.size()); - if (inst.size() != 0) { - std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(), - sizeof(Entry) * inst.size()); - } - } + void Push(const Inst &inst); size_t Size() { return offset.Size() - 1; } }; diff --git a/src/data/data.cc b/src/data/data.cc index d52ff4110..dc013c767 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -412,6 +412,18 @@ void SparsePage::PushCSC(const SparsePage &batch) { self_offset = std::move(offset); } +void SparsePage::Push(const Inst &inst) { + auto& data_vec = data.HostVector(); + auto& offset_vec = offset.HostVector(); + offset_vec.push_back(offset_vec.back() + inst.size()); + size_t begin = data_vec.size(); + data_vec.resize(begin + inst.size()); + if (inst.size() != 0) { + std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(), + sizeof(Entry) * inst.size()); + } +} + namespace data { // List of files that will be force linked in static links. DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format); diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h index 34b6b2c19..c32dac38d 100644 --- a/src/data/simple_dmatrix.h +++ b/src/data/simple_dmatrix.h @@ -20,7 +20,7 @@ namespace xgboost { namespace data { - +// Used for single batch data. class SimpleDMatrix : public DMatrix { public: explicit SimpleDMatrix(std::unique_ptr&& source) diff --git a/src/data/sparse_page_dmatrix.h b/src/data/sparse_page_dmatrix.h index 420c355d1..cc6514bd4 100644 --- a/src/data/sparse_page_dmatrix.h +++ b/src/data/sparse_page_dmatrix.h @@ -18,7 +18,7 @@ namespace xgboost { namespace data { - +// Used for external memory. class SparsePageDMatrix : public DMatrix { public: explicit SparsePageDMatrix(std::unique_ptr&& source, diff --git a/src/data/sparse_page_source.cc b/src/data/sparse_page_source.cc index 34007fc73..eac29e96c 100644 --- a/src/data/sparse_page_source.cc +++ b/src/data/sparse_page_source.cc @@ -221,8 +221,8 @@ void SparsePageSource::CreateRowPage(dmlc::Parser* src, CHECK(info.qids_.empty() || info.qids_.size() == info.num_row_); info.SaveBinary(fo.get()); } - LOG(CONSOLE) << "SparsePageSource::CreateRowPage Finished writing to " - << name_info; + LOG(INFO) << "SparsePageSource::CreateRowPage Finished writing to " + << name_info; } void SparsePageSource::CreatePageFromDMatrix(DMatrix* src, @@ -251,7 +251,7 @@ void SparsePageSource::CreatePageFromDMatrix(DMatrix* src, if (page_type == ".row.page") { page->Push(batch); } else if (page_type == ".col.page") { - page->Push(batch.GetTranspose(src->Info().num_col_)); + page->PushCSC(batch.GetTranspose(src->Info().num_col_)); } else if (page_type == ".sorted.col.page") { SparsePage tmp = batch.GetTranspose(src->Info().num_col_); page->PushCSC(tmp); @@ -266,9 +266,9 @@ void SparsePageSource::CreatePageFromDMatrix(DMatrix* src, writer.Alloc(&page); page->Clear(); double tdiff = dmlc::GetTime() - tstart; - LOG(CONSOLE) << "Writing to " << cache_info << " in " - << ((bytes_write >> 20UL) / tdiff) << " MB/s, " - << (bytes_write >> 20UL) << " written"; + LOG(INFO) << "Writing to " << cache_info << " in " + << ((bytes_write >> 20UL) / tdiff) << " MB/s, " + << (bytes_write >> 20UL) << " written"; } } if (page->data.Size() != 0) { @@ -281,7 +281,7 @@ void SparsePageSource::CreatePageFromDMatrix(DMatrix* src, fo->Write(&tmagic, sizeof(tmagic)); info.SaveBinary(fo.get()); } - LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info; + LOG(INFO) << "SparsePageSource: Finished writing to " << name_info; } void SparsePageSource::CreateRowPage(DMatrix* src, diff --git a/src/data/sparse_page_writer.cc b/src/data/sparse_page_writer.cc index 7b642dd95..a78ced8a5 100644 --- a/src/data/sparse_page_writer.cc +++ b/src/data/sparse_page_writer.cc @@ -39,7 +39,7 @@ SparsePageWriter::SparsePageWriter( qrecycle_.Push(std::move(page)); } fo.reset(nullptr); - LOG(CONSOLE) << "SparsePage::Writer Finished writing to " << name_shard; + LOG(INFO) << "SparsePage::Writer Finished writing to " << name_shard; })); } } diff --git a/tests/cpp/common/test_column_matrix.cc b/tests/cpp/common/test_column_matrix.cc index 83cbefe57..25f2688b2 100644 --- a/tests/cpp/common/test_column_matrix.cc +++ b/tests/cpp/common/test_column_matrix.cc @@ -1,6 +1,9 @@ +#include +#include + #include "../../../src/common/column_matrix.h" #include "../helpers.h" -#include "gtest/gtest.h" + namespace xgboost { namespace common { @@ -51,10 +54,11 @@ TEST(DenseColumnWithMissing, Test) { delete dmat; } -void -TestGHistIndexMatrixCreation(size_t nthreads) { +void TestGHistIndexMatrixCreation(size_t nthreads) { + dmlc::TemporaryDirectory tmpdir; + std::string filename = tmpdir.path + "/big.libsvm"; /* This should create multiple sparse pages */ - std::unique_ptr dmat = CreateSparsePageDMatrix(1024, 1024); + std::unique_ptr dmat{ CreateSparsePageDMatrix(1024, 1024, filename) }; omp_set_num_threads(nthreads); GHistIndexMatrix gmat; gmat.Init(dmat.get(), 256); diff --git a/tests/cpp/common/test_gpu_hist_util.cu b/tests/cpp/common/test_gpu_hist_util.cu index 8a1b3ed59..4ade9ab80 100644 --- a/tests/cpp/common/test_gpu_hist_util.cu +++ b/tests/cpp/common/test_gpu_hist_util.cu @@ -1,7 +1,9 @@ +#include +#include + #include #include -#include "gtest/gtest.h" #include #include @@ -22,10 +24,12 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) { std::shared_ptr *dmat = nullptr; size_t num_cols = 1; + dmlc::TemporaryDirectory tmpdir; + std::string file = tmpdir.path + "/big.libsvm"; if (use_external_memory) { - auto sp_dmat = CreateSparsePageDMatrix(nrows * 3, 128UL); // 3 entries/row - dmat = new std::shared_ptr(std::move(sp_dmat)); - num_cols = 5; + auto sp_dmat = CreateSparsePageDMatrix(nrows * 3, 128UL, file); // 3 entries/row + dmat = new std::shared_ptr(std::move(sp_dmat)); + num_cols = 5; } else { std::vector test_data(nrows); auto count_iter = thrust::make_counting_iterator(0); diff --git a/tests/cpp/data/test_data.cc b/tests/cpp/data/test_data.cc index 0858ef7a6..622351bbc 100644 --- a/tests/cpp/data/test_data.cc +++ b/tests/cpp/data/test_data.cc @@ -1,4 +1,5 @@ #include +#include #include #include "xgboost/data.h" @@ -55,8 +56,11 @@ TEST(SparsePage, PushCSC) { } TEST(SparsePage, PushCSCAfterTranspose) { + dmlc::TemporaryDirectory tmpdir; + std::string filename = tmpdir.path + "/big.libsvm"; const int n_entries = 9; - std::unique_ptr dmat = CreateSparsePageDMatrix(n_entries, 64UL); + std::unique_ptr dmat = + CreateSparsePageDMatrix(n_entries, 64UL, filename); const int ncols = dmat->Info().num_col_; SparsePage page; // Consolidated sparse page for (const auto &batch : dmat->GetRowBatches()) { @@ -70,7 +74,7 @@ TEST(SparsePage, PushCSCAfterTranspose) { // The feature value for a feature in each row should be identical, as that is // how the dmatrix has been created - for (int i = 0; i < page.Size(); ++i) { + for (size_t i = 0; i < page.Size(); ++i) { auto inst = page[i]; for (int j = 1; j < inst.size(); ++j) { ASSERT_EQ(inst[0].fvalue, inst[j].fvalue); diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index dd842a44f..9372872a1 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -1,4 +1,5 @@ // Copyright by Contributors +#include #include #include #include @@ -26,7 +27,10 @@ TEST(SparsePageDMatrix, MetaInfo) { } TEST(SparsePageDMatrix, RowAccess) { - std::unique_ptr dmat = xgboost::CreateSparsePageDMatrix(12, 64); + dmlc::TemporaryDirectory tmpdir; + std::string filename = tmpdir.path + "/big.libsvm"; + std::unique_ptr dmat = + xgboost::CreateSparsePageDMatrix(12, 64, filename); // Test the data read into the first row auto &batch = *dmat->GetRowBatches().begin(); @@ -67,3 +71,19 @@ TEST(SparsePageDMatrix, ColAccess) { delete dmat; } + +// Multi-batches access +TEST(SparsePageDMatrix, ColAccessBatches) { + dmlc::TemporaryDirectory tmpdir; + std::string filename = tmpdir.path + "/big.libsvm"; + // Create multiple sparse pages + std::unique_ptr dmat { + xgboost::CreateSparsePageDMatrix(1024, 1024, filename) + }; + auto n_threads = omp_get_max_threads(); + omp_set_num_threads(16); + for (auto const& page : dmat->GetColumnBatches()) { + ASSERT_EQ(dmat->Info().num_col_, page.Size()); + } + omp_set_num_threads(n_threads); +} diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 514f39010..52aac2337 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -1,11 +1,13 @@ /*! * Copyright 2016-2018 XGBoost contributors */ -#include "./helpers.h" -#include "xgboost/c_api.h" +#include +#include #include #include -#include +#include "./helpers.h" +#include "xgboost/c_api.h" + #include "../../src/data/simple_csr_source.h" bool FileExists(const std::string& filename) { @@ -144,13 +146,12 @@ std::shared_ptr* CreateDMatrix(int rows, int columns, return static_cast *>(handle); } -std::unique_ptr CreateSparsePageDMatrix(size_t n_entries, size_t page_size) { +std::unique_ptr CreateSparsePageDMatrix( + size_t n_entries, size_t page_size, std::string tmp_file) { // Create sufficiently large data to make two row pages - dmlc::TemporaryDirectory tempdir; - const std::string tmp_file = tempdir.path + "/big.libsvm"; CreateBigTestData(tmp_file, n_entries); - std::unique_ptr dmat = std::unique_ptr(DMatrix::Load( - tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)); + std::unique_ptr dmat { DMatrix::Load( + tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)}; EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page")); // Loop over the batches and count the records diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 0c3c8e535..a2a9f5d68 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -163,7 +163,8 @@ class SimpleRealUniformDistribution { std::shared_ptr *CreateDMatrix(int rows, int columns, float sparsity, int seed = 0); -std::unique_ptr CreateSparsePageDMatrix(size_t n_entries, size_t page_size); +std::unique_ptr CreateSparsePageDMatrix( + size_t n_entries, size_t page_size, std::string tmp_file); /** * \fn std::unique_ptr CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols, diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 34da550f8..0db5f66d0 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -56,7 +56,9 @@ TEST(cpu_predictor, Test) { } TEST(cpu_predictor, ExternalMemoryTest) { - std::unique_ptr dmat = CreateSparsePageDMatrix(12, 64); + dmlc::TemporaryDirectory tmpdir; + std::string filename = tmpdir.path + "/big.libsvm"; + std::unique_ptr dmat = CreateSparsePageDMatrix(12, 64, filename); auto lparam = CreateEmptyGenericParam(0, 0); std::unique_ptr cpu_predictor = std::unique_ptr(Predictor::Create("cpu_predictor", &lparam)); diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 2dffab8b6..751cff1c5 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -97,7 +97,9 @@ TEST(gpu_predictor, ExternalMemoryTest) { gbm::GBTreeModel model = CreateTestModel(); int n_col = 3; model.param.num_feature = n_col; - std::unique_ptr dmat = CreateSparsePageDMatrix(32, 64); + dmlc::TemporaryDirectory tmpdir; + std::string filename = tmpdir.path + "/big.libsvm"; + std::unique_ptr dmat = CreateSparsePageDMatrix(32, 64, filename); // Test predict batch HostDeviceVector out_predictions; @@ -268,9 +270,13 @@ TEST(gpu_predictor, MGPU_ExternalMemoryTest) { const int n_classes = 3; model.param.num_output_group = n_classes; std::vector> dmats; - dmats.push_back(CreateSparsePageDMatrix(9, 64UL)); - dmats.push_back(CreateSparsePageDMatrix(128, 128UL)); - dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL)); + dmlc::TemporaryDirectory tmpdir; + std::string file0 = tmpdir.path + "/big_0.libsvm"; + std::string file1 = tmpdir.path + "/big_1.libsvm"; + std::string file2 = tmpdir.path + "/big_2.libsvm"; + dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0)); + dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1)); + dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2)); for (const auto& dmat: dmats) { // Test predict batch