Fix external memory for get column batches. (#4622)
* Fix external memory for get column batches.
  This fixes two bugs:
  * Use PushCSC for get column batches.
  * Don't remove the created temporary directory before finishing the test.
* Check all pages.
This commit is contained in:
parent
a30176907f
commit
45876bf41b
@ -265,17 +265,7 @@ class SparsePage {
|
|||||||
* \brief Push one instance into page
|
* \brief Push one instance into page
|
||||||
* \param inst an instance row
|
* \param inst an instance row
|
||||||
*/
|
*/
|
||||||
inline void Push(const Inst &inst) {
|
void Push(const Inst &inst);
|
||||||
auto& data_vec = data.HostVector();
|
|
||||||
auto& offset_vec = offset.HostVector();
|
|
||||||
offset_vec.push_back(offset_vec.back() + inst.size());
|
|
||||||
size_t begin = data_vec.size();
|
|
||||||
data_vec.resize(begin + inst.size());
|
|
||||||
if (inst.size() != 0) {
|
|
||||||
std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(),
|
|
||||||
sizeof(Entry) * inst.size());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t Size() { return offset.Size() - 1; }
|
size_t Size() { return offset.Size() - 1; }
|
||||||
};
|
};
|
||||||
|
|||||||
@ -412,6 +412,18 @@ void SparsePage::PushCSC(const SparsePage &batch) {
|
|||||||
self_offset = std::move(offset);
|
self_offset = std::move(offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void SparsePage::Push(const Inst &inst) {
|
||||||
|
auto& data_vec = data.HostVector();
|
||||||
|
auto& offset_vec = offset.HostVector();
|
||||||
|
offset_vec.push_back(offset_vec.back() + inst.size());
|
||||||
|
size_t begin = data_vec.size();
|
||||||
|
data_vec.resize(begin + inst.size());
|
||||||
|
if (inst.size() != 0) {
|
||||||
|
std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(),
|
||||||
|
sizeof(Entry) * inst.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
namespace data {
|
namespace data {
|
||||||
// List of files that will be force linked in static links.
|
// List of files that will be force linked in static links.
|
||||||
DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);
|
DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);
|
||||||
|
|||||||
@ -20,7 +20,7 @@
|
|||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
|
// Used for single batch data.
|
||||||
class SimpleDMatrix : public DMatrix {
|
class SimpleDMatrix : public DMatrix {
|
||||||
public:
|
public:
|
||||||
explicit SimpleDMatrix(std::unique_ptr<DataSource>&& source)
|
explicit SimpleDMatrix(std::unique_ptr<DataSource>&& source)
|
||||||
|
|||||||
@ -18,7 +18,7 @@
|
|||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
|
// Used for external memory.
|
||||||
class SparsePageDMatrix : public DMatrix {
|
class SparsePageDMatrix : public DMatrix {
|
||||||
public:
|
public:
|
||||||
explicit SparsePageDMatrix(std::unique_ptr<DataSource>&& source,
|
explicit SparsePageDMatrix(std::unique_ptr<DataSource>&& source,
|
||||||
|
|||||||
@ -221,8 +221,8 @@ void SparsePageSource::CreateRowPage(dmlc::Parser<uint32_t>* src,
|
|||||||
CHECK(info.qids_.empty() || info.qids_.size() == info.num_row_);
|
CHECK(info.qids_.empty() || info.qids_.size() == info.num_row_);
|
||||||
info.SaveBinary(fo.get());
|
info.SaveBinary(fo.get());
|
||||||
}
|
}
|
||||||
LOG(CONSOLE) << "SparsePageSource::CreateRowPage Finished writing to "
|
LOG(INFO) << "SparsePageSource::CreateRowPage Finished writing to "
|
||||||
<< name_info;
|
<< name_info;
|
||||||
}
|
}
|
||||||
|
|
||||||
void SparsePageSource::CreatePageFromDMatrix(DMatrix* src,
|
void SparsePageSource::CreatePageFromDMatrix(DMatrix* src,
|
||||||
@ -251,7 +251,7 @@ void SparsePageSource::CreatePageFromDMatrix(DMatrix* src,
|
|||||||
if (page_type == ".row.page") {
|
if (page_type == ".row.page") {
|
||||||
page->Push(batch);
|
page->Push(batch);
|
||||||
} else if (page_type == ".col.page") {
|
} else if (page_type == ".col.page") {
|
||||||
page->Push(batch.GetTranspose(src->Info().num_col_));
|
page->PushCSC(batch.GetTranspose(src->Info().num_col_));
|
||||||
} else if (page_type == ".sorted.col.page") {
|
} else if (page_type == ".sorted.col.page") {
|
||||||
SparsePage tmp = batch.GetTranspose(src->Info().num_col_);
|
SparsePage tmp = batch.GetTranspose(src->Info().num_col_);
|
||||||
page->PushCSC(tmp);
|
page->PushCSC(tmp);
|
||||||
@ -266,9 +266,9 @@ void SparsePageSource::CreatePageFromDMatrix(DMatrix* src,
|
|||||||
writer.Alloc(&page);
|
writer.Alloc(&page);
|
||||||
page->Clear();
|
page->Clear();
|
||||||
double tdiff = dmlc::GetTime() - tstart;
|
double tdiff = dmlc::GetTime() - tstart;
|
||||||
LOG(CONSOLE) << "Writing to " << cache_info << " in "
|
LOG(INFO) << "Writing to " << cache_info << " in "
|
||||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||||
<< (bytes_write >> 20UL) << " written";
|
<< (bytes_write >> 20UL) << " written";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (page->data.Size() != 0) {
|
if (page->data.Size() != 0) {
|
||||||
@ -281,7 +281,7 @@ void SparsePageSource::CreatePageFromDMatrix(DMatrix* src,
|
|||||||
fo->Write(&tmagic, sizeof(tmagic));
|
fo->Write(&tmagic, sizeof(tmagic));
|
||||||
info.SaveBinary(fo.get());
|
info.SaveBinary(fo.get());
|
||||||
}
|
}
|
||||||
LOG(CONSOLE) << "SparsePageSource: Finished writing to " << name_info;
|
LOG(INFO) << "SparsePageSource: Finished writing to " << name_info;
|
||||||
}
|
}
|
||||||
|
|
||||||
void SparsePageSource::CreateRowPage(DMatrix* src,
|
void SparsePageSource::CreateRowPage(DMatrix* src,
|
||||||
|
|||||||
@ -39,7 +39,7 @@ SparsePageWriter::SparsePageWriter(
|
|||||||
qrecycle_.Push(std::move(page));
|
qrecycle_.Push(std::move(page));
|
||||||
}
|
}
|
||||||
fo.reset(nullptr);
|
fo.reset(nullptr);
|
||||||
LOG(CONSOLE) << "SparsePage::Writer Finished writing to " << name_shard;
|
LOG(INFO) << "SparsePage::Writer Finished writing to " << name_shard;
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,6 +1,9 @@
|
|||||||
|
#include <dmlc/filesystem.h>
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
#include "../../../src/common/column_matrix.h"
|
#include "../../../src/common/column_matrix.h"
|
||||||
#include "../helpers.h"
|
#include "../helpers.h"
|
||||||
#include "gtest/gtest.h"
|
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace common {
|
namespace common {
|
||||||
@ -51,10 +54,11 @@ TEST(DenseColumnWithMissing, Test) {
|
|||||||
delete dmat;
|
delete dmat;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void TestGHistIndexMatrixCreation(size_t nthreads) {
|
||||||
TestGHistIndexMatrixCreation(size_t nthreads) {
|
dmlc::TemporaryDirectory tmpdir;
|
||||||
|
std::string filename = tmpdir.path + "/big.libsvm";
|
||||||
/* This should create multiple sparse pages */
|
/* This should create multiple sparse pages */
|
||||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(1024, 1024);
|
std::unique_ptr<DMatrix> dmat{ CreateSparsePageDMatrix(1024, 1024, filename) };
|
||||||
omp_set_num_threads(nthreads);
|
omp_set_num_threads(nthreads);
|
||||||
GHistIndexMatrix gmat;
|
GHistIndexMatrix gmat;
|
||||||
gmat.Init(dmat.get(), 256);
|
gmat.Init(dmat.get(), 256);
|
||||||
|
|||||||
@ -1,7 +1,9 @@
|
|||||||
|
#include <dmlc/filesystem.h>
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include "gtest/gtest.h"
|
|
||||||
|
|
||||||
#include <thrust/device_vector.h>
|
#include <thrust/device_vector.h>
|
||||||
#include <thrust/iterator/counting_iterator.h>
|
#include <thrust/iterator/counting_iterator.h>
|
||||||
@ -22,10 +24,12 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) {
|
|||||||
std::shared_ptr<xgboost::DMatrix> *dmat = nullptr;
|
std::shared_ptr<xgboost::DMatrix> *dmat = nullptr;
|
||||||
|
|
||||||
size_t num_cols = 1;
|
size_t num_cols = 1;
|
||||||
|
dmlc::TemporaryDirectory tmpdir;
|
||||||
|
std::string file = tmpdir.path + "/big.libsvm";
|
||||||
if (use_external_memory) {
|
if (use_external_memory) {
|
||||||
auto sp_dmat = CreateSparsePageDMatrix(nrows * 3, 128UL); // 3 entries/row
|
auto sp_dmat = CreateSparsePageDMatrix(nrows * 3, 128UL, file); // 3 entries/row
|
||||||
dmat = new std::shared_ptr<xgboost::DMatrix>(std::move(sp_dmat));
|
dmat = new std::shared_ptr<xgboost::DMatrix>(std::move(sp_dmat));
|
||||||
num_cols = 5;
|
num_cols = 5;
|
||||||
} else {
|
} else {
|
||||||
std::vector<float> test_data(nrows);
|
std::vector<float> test_data(nrows);
|
||||||
auto count_iter = thrust::make_counting_iterator(0);
|
auto count_iter = thrust::make_counting_iterator(0);
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
|
#include <dmlc/filesystem.h>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "xgboost/data.h"
|
#include "xgboost/data.h"
|
||||||
@ -55,8 +56,11 @@ TEST(SparsePage, PushCSC) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(SparsePage, PushCSCAfterTranspose) {
|
TEST(SparsePage, PushCSCAfterTranspose) {
|
||||||
|
dmlc::TemporaryDirectory tmpdir;
|
||||||
|
std::string filename = tmpdir.path + "/big.libsvm";
|
||||||
const int n_entries = 9;
|
const int n_entries = 9;
|
||||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(n_entries, 64UL);
|
std::unique_ptr<DMatrix> dmat =
|
||||||
|
CreateSparsePageDMatrix(n_entries, 64UL, filename);
|
||||||
const int ncols = dmat->Info().num_col_;
|
const int ncols = dmat->Info().num_col_;
|
||||||
SparsePage page; // Consolidated sparse page
|
SparsePage page; // Consolidated sparse page
|
||||||
for (const auto &batch : dmat->GetRowBatches()) {
|
for (const auto &batch : dmat->GetRowBatches()) {
|
||||||
@ -70,7 +74,7 @@ TEST(SparsePage, PushCSCAfterTranspose) {
|
|||||||
|
|
||||||
// The feature value for a feature in each row should be identical, as that is
|
// The feature value for a feature in each row should be identical, as that is
|
||||||
// how the dmatrix has been created
|
// how the dmatrix has been created
|
||||||
for (int i = 0; i < page.Size(); ++i) {
|
for (size_t i = 0; i < page.Size(); ++i) {
|
||||||
auto inst = page[i];
|
auto inst = page[i];
|
||||||
for (int j = 1; j < inst.size(); ++j) {
|
for (int j = 1; j < inst.size(); ++j) {
|
||||||
ASSERT_EQ(inst[0].fvalue, inst[j].fvalue);
|
ASSERT_EQ(inst[0].fvalue, inst[j].fvalue);
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
// Copyright by Contributors
|
// Copyright by Contributors
|
||||||
|
#include <dmlc/filesystem.h>
|
||||||
#include <xgboost/data.h>
|
#include <xgboost/data.h>
|
||||||
#include <dmlc/filesystem.h>
|
#include <dmlc/filesystem.h>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
@ -26,7 +27,10 @@ TEST(SparsePageDMatrix, MetaInfo) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(SparsePageDMatrix, RowAccess) {
|
TEST(SparsePageDMatrix, RowAccess) {
|
||||||
std::unique_ptr<xgboost::DMatrix> dmat = xgboost::CreateSparsePageDMatrix(12, 64);
|
dmlc::TemporaryDirectory tmpdir;
|
||||||
|
std::string filename = tmpdir.path + "/big.libsvm";
|
||||||
|
std::unique_ptr<xgboost::DMatrix> dmat =
|
||||||
|
xgboost::CreateSparsePageDMatrix(12, 64, filename);
|
||||||
|
|
||||||
// Test the data read into the first row
|
// Test the data read into the first row
|
||||||
auto &batch = *dmat->GetRowBatches().begin();
|
auto &batch = *dmat->GetRowBatches().begin();
|
||||||
@ -67,3 +71,19 @@ TEST(SparsePageDMatrix, ColAccess) {
|
|||||||
|
|
||||||
delete dmat;
|
delete dmat;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Multi-batches access
|
||||||
|
TEST(SparsePageDMatrix, ColAccessBatches) {
|
||||||
|
dmlc::TemporaryDirectory tmpdir;
|
||||||
|
std::string filename = tmpdir.path + "/big.libsvm";
|
||||||
|
// Create multiple sparse pages
|
||||||
|
std::unique_ptr<xgboost::DMatrix> dmat {
|
||||||
|
xgboost::CreateSparsePageDMatrix(1024, 1024, filename)
|
||||||
|
};
|
||||||
|
auto n_threads = omp_get_max_threads();
|
||||||
|
omp_set_num_threads(16);
|
||||||
|
for (auto const& page : dmat->GetColumnBatches()) {
|
||||||
|
ASSERT_EQ(dmat->Info().num_col_, page.Size());
|
||||||
|
}
|
||||||
|
omp_set_num_threads(n_threads);
|
||||||
|
}
|
||||||
|
|||||||
@ -1,11 +1,13 @@
|
|||||||
/*!
|
/*!
|
||||||
* Copyright 2016-2018 XGBoost contributors
|
* Copyright 2016-2018 XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#include "./helpers.h"
|
#include <dmlc/filesystem.h>
|
||||||
#include "xgboost/c_api.h"
|
#include <xgboost/logging.h>
|
||||||
#include <random>
|
#include <random>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <dmlc/filesystem.h>
|
#include "./helpers.h"
|
||||||
|
#include "xgboost/c_api.h"
|
||||||
|
|
||||||
#include "../../src/data/simple_csr_source.h"
|
#include "../../src/data/simple_csr_source.h"
|
||||||
|
|
||||||
bool FileExists(const std::string& filename) {
|
bool FileExists(const std::string& filename) {
|
||||||
@ -144,13 +146,12 @@ std::shared_ptr<xgboost::DMatrix>* CreateDMatrix(int rows, int columns,
|
|||||||
return static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
|
return static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, size_t page_size) {
|
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
|
||||||
|
size_t n_entries, size_t page_size, std::string tmp_file) {
|
||||||
// Create sufficiently large data to make two row pages
|
// Create sufficiently large data to make two row pages
|
||||||
dmlc::TemporaryDirectory tempdir;
|
|
||||||
const std::string tmp_file = tempdir.path + "/big.libsvm";
|
|
||||||
CreateBigTestData(tmp_file, n_entries);
|
CreateBigTestData(tmp_file, n_entries);
|
||||||
std::unique_ptr<DMatrix> dmat = std::unique_ptr<DMatrix>(DMatrix::Load(
|
std::unique_ptr<DMatrix> dmat { DMatrix::Load(
|
||||||
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size));
|
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)};
|
||||||
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
|
EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
|
||||||
|
|
||||||
// Loop over the batches and count the records
|
// Loop over the batches and count the records
|
||||||
|
|||||||
@ -163,7 +163,8 @@ class SimpleRealUniformDistribution {
|
|||||||
std::shared_ptr<xgboost::DMatrix> *CreateDMatrix(int rows, int columns,
|
std::shared_ptr<xgboost::DMatrix> *CreateDMatrix(int rows, int columns,
|
||||||
float sparsity, int seed = 0);
|
float sparsity, int seed = 0);
|
||||||
|
|
||||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, size_t page_size);
|
std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
|
||||||
|
size_t n_entries, size_t page_size, std::string tmp_file);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
|
* \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
|
||||||
|
|||||||
@ -56,7 +56,9 @@ TEST(cpu_predictor, Test) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(cpu_predictor, ExternalMemoryTest) {
|
TEST(cpu_predictor, ExternalMemoryTest) {
|
||||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64);
|
dmlc::TemporaryDirectory tmpdir;
|
||||||
|
std::string filename = tmpdir.path + "/big.libsvm";
|
||||||
|
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename);
|
||||||
auto lparam = CreateEmptyGenericParam(0, 0);
|
auto lparam = CreateEmptyGenericParam(0, 0);
|
||||||
std::unique_ptr<Predictor> cpu_predictor =
|
std::unique_ptr<Predictor> cpu_predictor =
|
||||||
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
|
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
|
||||||
|
|||||||
@ -97,7 +97,9 @@ TEST(gpu_predictor, ExternalMemoryTest) {
|
|||||||
gbm::GBTreeModel model = CreateTestModel();
|
gbm::GBTreeModel model = CreateTestModel();
|
||||||
int n_col = 3;
|
int n_col = 3;
|
||||||
model.param.num_feature = n_col;
|
model.param.num_feature = n_col;
|
||||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(32, 64);
|
dmlc::TemporaryDirectory tmpdir;
|
||||||
|
std::string filename = tmpdir.path + "/big.libsvm";
|
||||||
|
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(32, 64, filename);
|
||||||
|
|
||||||
// Test predict batch
|
// Test predict batch
|
||||||
HostDeviceVector<float> out_predictions;
|
HostDeviceVector<float> out_predictions;
|
||||||
@ -268,9 +270,13 @@ TEST(gpu_predictor, MGPU_ExternalMemoryTest) {
|
|||||||
const int n_classes = 3;
|
const int n_classes = 3;
|
||||||
model.param.num_output_group = n_classes;
|
model.param.num_output_group = n_classes;
|
||||||
std::vector<std::unique_ptr<DMatrix>> dmats;
|
std::vector<std::unique_ptr<DMatrix>> dmats;
|
||||||
dmats.push_back(CreateSparsePageDMatrix(9, 64UL));
|
dmlc::TemporaryDirectory tmpdir;
|
||||||
dmats.push_back(CreateSparsePageDMatrix(128, 128UL));
|
std::string file0 = tmpdir.path + "/big_0.libsvm";
|
||||||
dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL));
|
std::string file1 = tmpdir.path + "/big_1.libsvm";
|
||||||
|
std::string file2 = tmpdir.path + "/big_2.libsvm";
|
||||||
|
dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0));
|
||||||
|
dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
|
||||||
|
dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
|
||||||
|
|
||||||
for (const auto& dmat: dmats) {
|
for (const auto& dmat: dmats) {
|
||||||
// Test predict batch
|
// Test predict batch
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user