Write ELLPACK pages to disk (#4879)

* add ellpack source
* add batch param
* extract function to parse cache info
* construct ellpack info separately
* push batch to ellpack page
* write ellpack page.
* make sparse page source reusable
This commit is contained in:
Rong Ou
2019-10-22 20:44:32 -07:00
committed by Jiaming Yuan
parent 310fe60b35
commit 5b1715d97c
25 changed files with 935 additions and 408 deletions

View File

@@ -17,15 +17,13 @@ TEST(EllpackPage, EmptyDMatrix) {
constexpr int kNRows = 0, kNCols = 0, kMaxBin = 256, kGpuBatchNRows = 64;
constexpr float kSparsity = 0;
auto dmat = *CreateDMatrix(kNRows, kNCols, kSparsity);
auto& page = *dmat->GetBatches<EllpackPage>().begin();
auto& page = *dmat->GetBatches<EllpackPage>({0, kMaxBin, kGpuBatchNRows}).begin();
auto impl = page.Impl();
impl->Init(0, kMaxBin, kGpuBatchNRows);
ASSERT_EQ(impl->ellpack_matrix.feature_segments.size(), 1);
ASSERT_EQ(impl->ellpack_matrix.min_fvalue.size(), 0);
ASSERT_EQ(impl->ellpack_matrix.gidx_fvalue_map.size(), 0);
ASSERT_EQ(impl->ellpack_matrix.row_stride, 0);
ASSERT_EQ(impl->ellpack_matrix.null_gidx_value, 0);
ASSERT_EQ(impl->n_bins, 0);
ASSERT_EQ(impl->matrix.info.feature_segments.size(), 1);
ASSERT_EQ(impl->matrix.info.min_fvalue.size(), 0);
ASSERT_EQ(impl->matrix.info.gidx_fvalue_map.size(), 0);
ASSERT_EQ(impl->matrix.info.row_stride, 0);
ASSERT_EQ(impl->matrix.info.n_bins, 0);
ASSERT_EQ(impl->gidx_buffer.size(), 4);
}
@@ -37,7 +35,7 @@ TEST(EllpackPage, BuildGidxDense) {
dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
ASSERT_EQ(page->ellpack_matrix.row_stride, kNCols);
ASSERT_EQ(page->matrix.info.row_stride, kNCols);
std::vector<uint32_t> solution = {
0, 3, 8, 9, 14, 17, 20, 21,
@@ -70,7 +68,7 @@ TEST(EllpackPage, BuildGidxSparse) {
dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
ASSERT_LE(page->ellpack_matrix.row_stride, 3);
ASSERT_LE(page->matrix.info.row_stride, 3);
// row_stride = 3, 16 rows, 48 entries for ELLPack
std::vector<uint32_t> solution = {
@@ -78,7 +76,7 @@ TEST(EllpackPage, BuildGidxSparse) {
24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
};
for (size_t i = 0; i < kNRows * page->ellpack_matrix.row_stride; ++i) {
for (size_t i = 0; i < kNRows * page->matrix.info.row_stride; ++i) {
ASSERT_EQ(solution[i], gidx[i]);
}
}

View File

@@ -0,0 +1,26 @@
// Copyright by Contributors
#include <dmlc/filesystem.h>
#include <memory>
#include <string>
#include "../helpers.h"
namespace xgboost {
// Loads a small libsvm file using the "<data>#<cache>" URI form, walks the
// ELLPACK batches, and verifies that the expected cache files exist on disk.
TEST(GPUSparsePageDMatrix, EllpackPage) {
  dmlc::TemporaryDirectory tempdir;
  const std::string tmp_file = tempdir.path + "/simple.libsvm";
  CreateSimpleTestData(tmp_file);

  // Hold the matrix in a unique_ptr so it is released even if a call below
  // throws. It is declared after `tempdir`, so it is destroyed first, matching
  // the original order (matrix freed before the directory goes away).
  const std::string cache_prefix = tmp_file + ".cache";
  std::unique_ptr<DMatrix> dmat(
      DMatrix::Load(tmp_file + "#" + cache_prefix, true, false));
  // Fail cleanly instead of dereferencing a null matrix.
  ASSERT_TRUE(dmat != nullptr);

  // Loop over the batches and assert the data is as expected.
  // NOTE(review): {0, 256, 64} is presumably {device, max_bin, gpu_batch_nrows}
  // — confirm against the batch parameter type.
  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256, 64})) {
    EXPECT_EQ(batch.Size(), dmat->Info().num_row_);
  }

  // Iterating the batches should have left these files on disk.
  EXPECT_TRUE(FileExists(cache_prefix));
  EXPECT_TRUE(FileExists(cache_prefix + ".row.page"));
  EXPECT_TRUE(FileExists(cache_prefix + ".ellpack.page"));
}
} // namespace xgboost

View File

@@ -192,14 +192,14 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
return dmat;
}
std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
size_t page_size, bool deterministic) {
std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
size_t n_rows, size_t n_cols, size_t page_size, bool deterministic,
const dmlc::TemporaryDirectory& tempdir) {
if (!n_rows || !n_cols) {
return nullptr;
}
// Create the svm file in a temp dir
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/big.libsvm";
std::ofstream fo(tmp_file.c_str());

View File

@@ -14,6 +14,7 @@
#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <xgboost/base.h>
#include <xgboost/objective.h>
#include <xgboost/metric.h>
@@ -199,8 +200,9 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
*
* \return The new dmatrix.
*/
std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
size_t page_size, bool deterministic);
// NOTE(review): when `tempdir` is defaulted, the argument is a temporary that
// is destroyed at the end of the caller's full-expression. Presumably that
// removes the directory, so callers that need the generated files to outlive
// the call should pass their own TemporaryDirectory — confirm against
// dmlc::TemporaryDirectory.
std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
size_t n_rows, size_t n_cols, size_t page_size, bool deterministic,
const dmlc::TemporaryDirectory& tempdir = dmlc::TemporaryDirectory());
gbm::GBTreeModel CreateTestModel();
@@ -247,16 +249,15 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
0.26f, 0.71f, 1.83f});
cmat.SetMins({0.1f, 0.2f, 0.3f, 0.1f, 0.2f, 0.3f, 0.2f, 0.2f});
auto is_dense = (*dmat)->Info().num_nonzero_ ==
(*dmat)->Info().num_row_ * (*dmat)->Info().num_col_;
size_t row_stride = 0;
const auto &offset_vec = batch.offset.ConstHostVector();
for (size_t i = 1; i < offset_vec.size(); ++i) {
row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
}
auto page = std::unique_ptr<EllpackPageImpl>(new EllpackPageImpl(dmat->get()));
page->InitCompressedData(0, cmat, row_stride, is_dense);
auto page = std::unique_ptr<EllpackPageImpl>(new EllpackPageImpl(dmat->get(), {0, 256, 0}));
page->InitInfo(0, (*dmat)->IsDense(), row_stride, cmat);
page->InitCompressedData(0, n_rows);
page->CreateHistIndices(0, batch, RowStateOnDevice(batch.Size(), batch.Size()));
delete dmat;

View File

@@ -2,6 +2,7 @@
* Copyright 2017-2019 XGBoost contributors
*/
#include <thrust/device_vector.h>
#include <dmlc/filesystem.h>
#include <xgboost/base.h>
#include <random>
#include <string>
@@ -207,14 +208,14 @@ TEST(GpuHist, EvaluateSplits) {
// Copy cut matrix to device.
maker.ba.Allocate(0,
&(page->ellpack_matrix.feature_segments), cmat.Ptrs().size(),
&(page->ellpack_matrix.min_fvalue), cmat.MinValues().size(),
&(page->ellpack_matrix.gidx_fvalue_map), 24,
&(page->matrix.info.feature_segments), cmat.Ptrs().size(),
&(page->matrix.info.min_fvalue), cmat.MinValues().size(),
&(page->matrix.info.gidx_fvalue_map), 24,
&(maker.monotone_constraints), kNCols);
dh::CopyVectorToDeviceSpan(page->ellpack_matrix.feature_segments, cmat.Ptrs());
dh::CopyVectorToDeviceSpan(page->ellpack_matrix.gidx_fvalue_map, cmat.Values());
dh::CopyVectorToDeviceSpan(page->matrix.info.feature_segments, cmat.Ptrs());
dh::CopyVectorToDeviceSpan(page->matrix.info.gidx_fvalue_map, cmat.Values());
dh::CopyVectorToDeviceSpan(maker.monotone_constraints, param.monotone_constraints);
dh::CopyVectorToDeviceSpan(page->ellpack_matrix.min_fvalue, cmat.MinValues());
dh::CopyVectorToDeviceSpan(page->matrix.info.min_fvalue, cmat.MinValues());
// Initialize GPUHistMakerDevice::hist
maker.hist.Init(0, (max_bins - 1) * kNCols);
@@ -265,8 +266,10 @@ void TestHistogramIndexImpl() {
tree::GPUHistMakerSpecialised<GradientPairPrecise> hist_maker, hist_maker_ext;
std::unique_ptr<DMatrix> hist_maker_dmat(
CreateSparsePageDMatrixWithRC(kNRows, kNCols, 0, true));
dmlc::TemporaryDirectory tempdir;
std::unique_ptr<DMatrix> hist_maker_ext_dmat(
CreateSparsePageDMatrixWithRC(kNRows, kNCols, 128UL, true));
CreateSparsePageDMatrixWithRC(kNRows, kNCols, 128UL, true, tempdir));
std::vector<std::pair<std::string, std::string>> training_params = {
{"max_depth", "10"},
@@ -275,22 +278,21 @@ void TestHistogramIndexImpl() {
GenericParameter generic_param(CreateEmptyGenericParam(0));
hist_maker.Configure(training_params, &generic_param);
hist_maker.InitDataOnce(hist_maker_dmat.get());
hist_maker_ext.Configure(training_params, &generic_param);
hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get());
// Extract the device maker from the histogram makers and from that its compressed
// histogram index
const auto &maker = hist_maker.maker_;
const auto &maker = hist_maker.maker;
std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.size());
dh::CopyDeviceSpanToVector(&h_gidx_buffer, maker->page->gidx_buffer);
const auto &maker_ext = hist_maker_ext.maker_;
const auto &maker_ext = hist_maker_ext.maker;
std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.size());
dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, maker_ext->page->gidx_buffer);
ASSERT_EQ(maker->page->n_bins, maker_ext->page->n_bins);
ASSERT_EQ(maker->page->matrix.info.n_bins, maker_ext->page->matrix.info.n_bins);
ASSERT_EQ(maker->page->gidx_buffer.size(), maker_ext->page->gidx_buffer.size());
ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext);