add a test for cpu predictor using external memory (#4308)
* add a test for cpu predictor using external memory * allow different page size for testing
This commit is contained in:
parent
b72eab3e07
commit
81c1cd40ca
@ -433,12 +433,14 @@ class DMatrix {
|
||||
* \param load_row_split Flag to read in part of rows, divided among the workers in distributed mode.
|
||||
* \param file_format The format type of the file, used for dmlc::Parser::Create.
|
||||
* By default "auto" will be able to load in both local binary file.
|
||||
* \param page_size Page size for external memory.
|
||||
* \return The created DMatrix.
|
||||
*/
|
||||
static DMatrix* Load(const std::string& uri,
|
||||
bool silent,
|
||||
bool load_row_split,
|
||||
const std::string& file_format = "auto");
|
||||
const std::string& file_format = "auto",
|
||||
const size_t page_size = kPageSize);
|
||||
/*!
|
||||
* \brief create a new DMatrix, by wrapping a row_iterator, and meta info.
|
||||
* \param source The source iterator of the data, the create function takes ownership of the source.
|
||||
@ -454,6 +456,7 @@ class DMatrix {
|
||||
* \param parser The input data parser
|
||||
* \param cache_prefix The path to prefix of temporary cache file of the DMatrix when used in external memory mode.
|
||||
* This can be nullptr for common cases, and in-memory mode will be used.
|
||||
* \param page_size Page size for external memory.
|
||||
* \sa dmlc::Parser
|
||||
* \note dmlc-core provides efficient distributed data parser for libsvm format.
|
||||
* User can create and register customized parser to load their own format using DMLC_REGISTER_DATA_PARSER.
|
||||
@ -461,7 +464,11 @@ class DMatrix {
|
||||
* \return A created DMatrix.
|
||||
*/
|
||||
static DMatrix* Create(dmlc::Parser<uint32_t>* parser,
|
||||
const std::string& cache_prefix = "");
|
||||
const std::string& cache_prefix = "",
|
||||
const size_t page_size = kPageSize);
|
||||
|
||||
/*! \brief page size 32 MB */
|
||||
static const size_t kPageSize = 32UL << 20UL;
|
||||
};
|
||||
|
||||
// implementation of inline functions
|
||||
|
||||
@ -150,7 +150,8 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
|
||||
DMatrix* DMatrix::Load(const std::string& uri,
|
||||
bool silent,
|
||||
bool load_row_split,
|
||||
const std::string& file_format) {
|
||||
const std::string& file_format,
|
||||
const size_t page_size) {
|
||||
std::string fname, cache_file;
|
||||
size_t dlm_pos = uri.find('#');
|
||||
if (dlm_pos != std::string::npos) {
|
||||
@ -217,7 +218,7 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
||||
|
||||
std::unique_ptr<dmlc::Parser<uint32_t> > parser(
|
||||
dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
|
||||
DMatrix* dmat = DMatrix::Create(parser.get(), cache_file);
|
||||
DMatrix* dmat = DMatrix::Create(parser.get(), cache_file, page_size);
|
||||
if (!silent) {
|
||||
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
|
||||
<< dmat->Info().num_nonzero_ << " entries loaded from " << uri;
|
||||
@ -248,7 +249,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
||||
}
|
||||
|
||||
DMatrix* DMatrix::Create(dmlc::Parser<uint32_t>* parser,
|
||||
const std::string& cache_prefix) {
|
||||
const std::string& cache_prefix,
|
||||
const size_t page_size) {
|
||||
if (cache_prefix.length() == 0) {
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
source->CopyFrom(parser);
|
||||
@ -256,7 +258,7 @@ DMatrix* DMatrix::Create(dmlc::Parser<uint32_t>* parser,
|
||||
} else {
|
||||
#if DMLC_ENABLE_STD_THREAD
|
||||
if (!data::SparsePageSource::CacheExist(cache_prefix, ".row.page")) {
|
||||
data::SparsePageSource::CreateRowPage(parser, cache_prefix);
|
||||
data::SparsePageSource::CreateRowPage(parser, cache_prefix, page_size);
|
||||
}
|
||||
std::unique_ptr<data::SparsePageSource> source(
|
||||
new data::SparsePageSource(cache_prefix, ".row.page"));
|
||||
|
||||
@ -40,9 +40,6 @@ class SparsePageDMatrix : public DMatrix {
|
||||
bool SingleColBlock() const override;
|
||||
|
||||
private:
|
||||
/*! \brief page size 256 MB */
|
||||
static const size_t kPageSize = 256UL << 20UL;
|
||||
|
||||
// source data pointers.
|
||||
std::unique_ptr<DataSource> row_source_;
|
||||
std::unique_ptr<SparsePageSource> column_source_;
|
||||
|
||||
@ -126,7 +126,8 @@ bool SparsePageSource::CacheExist(const std::string& cache_info,
|
||||
}
|
||||
|
||||
void SparsePageSource::CreateRowPage(dmlc::Parser<uint32_t>* src,
|
||||
const std::string& cache_info) {
|
||||
const std::string& cache_info,
|
||||
const size_t page_size) {
|
||||
const std::string page_type = ".row.page";
|
||||
std::vector<std::string> cache_shards = GetCacheShards(cache_info);
|
||||
CHECK_NE(cache_shards.size(), 0U);
|
||||
@ -183,7 +184,7 @@ void SparsePageSource::CreateRowPage(dmlc::Parser<uint32_t>* src,
|
||||
static_cast<uint64_t>(index + 1));
|
||||
}
|
||||
page->Push(batch);
|
||||
if (page->MemCostBytes() >= kPageSize) {
|
||||
if (page->MemCostBytes() >= page_size) {
|
||||
bytes_write += page->MemCostBytes();
|
||||
writer.PushWrite(std::move(page));
|
||||
writer.Alloc(&page);
|
||||
@ -222,7 +223,8 @@ void SparsePageSource::CreateRowPage(dmlc::Parser<uint32_t>* src,
|
||||
|
||||
void SparsePageSource::CreatePageFromDMatrix(DMatrix* src,
|
||||
const std::string& cache_info,
|
||||
const std::string& page_type) {
|
||||
const std::string& page_type,
|
||||
const size_t page_size) {
|
||||
std::vector<std::string> cache_shards = GetCacheShards(cache_info);
|
||||
CHECK_NE(cache_shards.size(), 0U);
|
||||
// read in the info files.
|
||||
@ -254,7 +256,7 @@ void SparsePageSource::CreatePageFromDMatrix(DMatrix* src,
|
||||
LOG(FATAL) << "Unknown page type: " << page_type;
|
||||
}
|
||||
|
||||
if (page->MemCostBytes() >= kPageSize) {
|
||||
if (page->MemCostBytes() >= page_size) {
|
||||
bytes_write += page->MemCostBytes();
|
||||
writer.PushWrite(std::move(page));
|
||||
writer.Alloc(&page);
|
||||
|
||||
@ -48,9 +48,11 @@ class SparsePageSource : public DataSource {
|
||||
* \brief Create source by taking data from parser.
|
||||
* \param src source parser.
|
||||
* \param cache_info The cache_info of cache file location.
|
||||
* \param page_size Page size for external memory.
|
||||
*/
|
||||
static void CreateRowPage(dmlc::Parser<uint32_t>* src,
|
||||
const std::string& cache_info);
|
||||
const std::string& cache_info,
|
||||
const size_t page_size = DMatrix::kPageSize);
|
||||
/*!
|
||||
* \brief Create source cache by copy content from DMatrix.
|
||||
* \param cache_info The cache_info of cache file location.
|
||||
@ -73,14 +75,13 @@ class SparsePageSource : public DataSource {
|
||||
*/
|
||||
static bool CacheExist(const std::string& cache_info,
|
||||
const std::string& page_type);
|
||||
/*! \brief page size 32 MB */
|
||||
static const size_t kPageSize = 32UL << 20UL;
|
||||
/*! \brief magic number used to identify Page */
|
||||
static const int kMagic = 0xffffab02;
|
||||
|
||||
private:
|
||||
static void CreatePageFromDMatrix(DMatrix* src, const std::string& cache_info,
|
||||
const std::string& page_type);
|
||||
const std::string& page_type,
|
||||
const size_t page_size = DMatrix::kPageSize);
|
||||
/*! \brief number of rows */
|
||||
size_t base_rowid_;
|
||||
/*! \brief page currently on hold. */
|
||||
|
||||
@ -29,16 +29,19 @@ TEST(SparsePageDMatrix, RowAccess) {
|
||||
// Create sufficiently large data to make two row pages
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/big.libsvm";
|
||||
CreateBigTestData(tmp_file, 5000000);
|
||||
CreateBigTestData(tmp_file, 12);
|
||||
xgboost::DMatrix * dmat = xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false);
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", 64UL);
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
|
||||
|
||||
// Loop over the batches and count the records
|
||||
int64_t batch_count = 0;
|
||||
int64_t row_count = 0;
|
||||
for (auto &batch : dmat->GetRowBatches()) {
|
||||
for (const auto &batch : dmat->GetRowBatches()) {
|
||||
batch_count++;
|
||||
row_count += batch.Size();
|
||||
}
|
||||
EXPECT_EQ(batch_count, 2);
|
||||
EXPECT_EQ(row_count, dmat->Info().num_row_);
|
||||
|
||||
// Test the data read into the first row
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
// Copyright by Contributors
|
||||
#include <dmlc/filesystem.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/predictor.h>
|
||||
#include "../helpers.h"
|
||||
@ -59,4 +60,66 @@ TEST(cpu_predictor, Test) {
|
||||
|
||||
delete dmat;
|
||||
}
|
||||
|
||||
TEST(cpu_predictor, ExternalMemoryTest) {
|
||||
// Create sufficiently large data to make two row pages
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/big.libsvm";
|
||||
CreateBigTestData(tmp_file, 12);
|
||||
xgboost::DMatrix *dmat = xgboost::DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", 64UL);
|
||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
|
||||
int64_t batche_count = 0;
|
||||
for (const auto &batch : dmat->GetRowBatches()) {
|
||||
batche_count++;
|
||||
}
|
||||
EXPECT_EQ(batche_count, 2);
|
||||
|
||||
std::unique_ptr<Predictor> cpu_predictor =
|
||||
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor"));
|
||||
|
||||
std::vector<std::unique_ptr<RegTree>> trees;
|
||||
trees.push_back(std::unique_ptr<RegTree>(new RegTree));
|
||||
(*trees.back())[0].SetLeaf(1.5f);
|
||||
(*trees.back()).Stat(0).sum_hess = 1.0f;
|
||||
gbm::GBTreeModel model(0.5);
|
||||
model.CommitModel(std::move(trees), 0);
|
||||
model.param.num_output_group = 1;
|
||||
model.base_margin = 0;
|
||||
|
||||
// Test predict batch
|
||||
HostDeviceVector<float> out_predictions;
|
||||
cpu_predictor->PredictBatch(dmat, &out_predictions, model, 0);
|
||||
std::vector<float> &out_predictions_h = out_predictions.HostVector();
|
||||
EXPECT_EQ(out_predictions.Size(), dmat->Info().num_row_);
|
||||
for (const auto& v : out_predictions_h) {
|
||||
ASSERT_EQ(v, 1.5);
|
||||
}
|
||||
|
||||
// Test predict leaf
|
||||
std::vector<float> leaf_out_predictions;
|
||||
cpu_predictor->PredictLeaf(dmat, &leaf_out_predictions, model);
|
||||
EXPECT_EQ(leaf_out_predictions.size(), dmat->Info().num_row_);
|
||||
for (const auto& v : leaf_out_predictions) {
|
||||
ASSERT_EQ(v, 0);
|
||||
}
|
||||
|
||||
// Test predict contribution
|
||||
std::vector<float> out_contribution;
|
||||
cpu_predictor->PredictContribution(dmat, &out_contribution, model);
|
||||
EXPECT_EQ(out_contribution.size(), dmat->Info().num_row_);
|
||||
for (const auto& v : out_contribution) {
|
||||
ASSERT_EQ(v, 1.5);
|
||||
}
|
||||
|
||||
// Test predict contribution (approximate method)
|
||||
std::vector<float> out_contribution_approximate;
|
||||
cpu_predictor->PredictContribution(dmat, &out_contribution_approximate, model, true);
|
||||
EXPECT_EQ(out_contribution_approximate.size(), dmat->Info().num_row_);
|
||||
for (const auto& v : out_contribution_approximate) {
|
||||
ASSERT_EQ(v, 1.5);
|
||||
}
|
||||
|
||||
delete dmat;
|
||||
}
|
||||
} // namespace xgboost
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user