Write ELLPACK pages to disk (#4879)
* Add ellpack source.
* Add batch param.
* Extract function to parse cache info.
* Construct ellpack info separately.
* Push batch to ellpack page.
* Write ellpack page.
* Make sparse page source reusable.
This commit is contained in:
@@ -17,15 +17,13 @@ TEST(EllpackPage, EmptyDMatrix) {
|
||||
constexpr int kNRows = 0, kNCols = 0, kMaxBin = 256, kGpuBatchNRows = 64;
|
||||
constexpr float kSparsity = 0;
|
||||
auto dmat = *CreateDMatrix(kNRows, kNCols, kSparsity);
|
||||
auto& page = *dmat->GetBatches<EllpackPage>().begin();
|
||||
auto& page = *dmat->GetBatches<EllpackPage>({0, kMaxBin, kGpuBatchNRows}).begin();
|
||||
auto impl = page.Impl();
|
||||
impl->Init(0, kMaxBin, kGpuBatchNRows);
|
||||
ASSERT_EQ(impl->ellpack_matrix.feature_segments.size(), 1);
|
||||
ASSERT_EQ(impl->ellpack_matrix.min_fvalue.size(), 0);
|
||||
ASSERT_EQ(impl->ellpack_matrix.gidx_fvalue_map.size(), 0);
|
||||
ASSERT_EQ(impl->ellpack_matrix.row_stride, 0);
|
||||
ASSERT_EQ(impl->ellpack_matrix.null_gidx_value, 0);
|
||||
ASSERT_EQ(impl->n_bins, 0);
|
||||
ASSERT_EQ(impl->matrix.info.feature_segments.size(), 1);
|
||||
ASSERT_EQ(impl->matrix.info.min_fvalue.size(), 0);
|
||||
ASSERT_EQ(impl->matrix.info.gidx_fvalue_map.size(), 0);
|
||||
ASSERT_EQ(impl->matrix.info.row_stride, 0);
|
||||
ASSERT_EQ(impl->matrix.info.n_bins, 0);
|
||||
ASSERT_EQ(impl->gidx_buffer.size(), 4);
|
||||
}
|
||||
|
||||
@@ -37,7 +35,7 @@ TEST(EllpackPage, BuildGidxDense) {
|
||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
|
||||
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
||||
|
||||
ASSERT_EQ(page->ellpack_matrix.row_stride, kNCols);
|
||||
ASSERT_EQ(page->matrix.info.row_stride, kNCols);
|
||||
|
||||
std::vector<uint32_t> solution = {
|
||||
0, 3, 8, 9, 14, 17, 20, 21,
|
||||
@@ -70,7 +68,7 @@ TEST(EllpackPage, BuildGidxSparse) {
|
||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
|
||||
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
||||
|
||||
ASSERT_LE(page->ellpack_matrix.row_stride, 3);
|
||||
ASSERT_LE(page->matrix.info.row_stride, 3);
|
||||
|
||||
// row_stride = 3, 16 rows, 48 entries for ELLPack
|
||||
std::vector<uint32_t> solution = {
|
||||
@@ -78,7 +76,7 @@ TEST(EllpackPage, BuildGidxSparse) {
|
||||
24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
|
||||
24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
|
||||
};
|
||||
for (size_t i = 0; i < kNRows * page->ellpack_matrix.row_stride; ++i) {
|
||||
for (size_t i = 0; i < kNRows * page->matrix.info.row_stride; ++i) {
|
||||
ASSERT_EQ(solution[i], gidx[i]);
|
||||
}
|
||||
}
|
||||
|
||||
26
tests/cpp/data/test_sparse_page_dmatrix.cu
Normal file
26
tests/cpp/data/test_sparse_page_dmatrix.cu
Normal file
@@ -0,0 +1,26 @@
|
||||
// Copyright by Contributors
|
||||
|
||||
#include <dmlc/filesystem.h>

#include <memory>
#include <string>

#include "../helpers.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
// Loads a DMatrix from a libsvm file using a `#<cache prefix>` URI suffix,
// iterates its EllpackPage batches, and checks that the cache file, the row
// page file and the ellpack page file all exist afterwards.
TEST(GPUSparsePageDMatrix, EllpackPage) {
  dmlc::TemporaryDirectory tempdir;
  const std::string tmp_file = tempdir.path + "/simple.libsvm";
  CreateSimpleTestData(tmp_file);

  // Owned by a smart pointer so the matrix is released even if a call below
  // throws. Declared after `tempdir`, so it is destroyed before `tempdir`
  // goes out of scope, matching the original explicit `delete` at the end.
  const std::unique_ptr<DMatrix> dmat{
      DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", true, false)};

  // Loop over the batches and assert the data is as expected
  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256, 64})) {
    EXPECT_EQ(batch.Size(), dmat->Info().num_row_);
  }

  EXPECT_TRUE(FileExists(tmp_file + ".cache"));
  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
  EXPECT_TRUE(FileExists(tmp_file + ".cache.ellpack.page"));
}
|
||||
|
||||
} // namespace xgboost
|
||||
@@ -192,14 +192,14 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
|
||||
return dmat;
|
||||
}
|
||||
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
|
||||
size_t page_size, bool deterministic) {
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
|
||||
size_t n_rows, size_t n_cols, size_t page_size, bool deterministic,
|
||||
const dmlc::TemporaryDirectory& tempdir) {
|
||||
if (!n_rows || !n_cols) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Create the svm file in a temp dir
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/big.libsvm";
|
||||
|
||||
std::ofstream fo(tmp_file.c_str());
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <dmlc/filesystem.h>
|
||||
#include <xgboost/base.h>
|
||||
#include <xgboost/objective.h>
|
||||
#include <xgboost/metric.h>
|
||||
@@ -199,8 +200,9 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
|
||||
*
|
||||
* \return The new dmatrix.
|
||||
*/
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
|
||||
size_t page_size, bool deterministic);
|
||||
std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
|
||||
size_t n_rows, size_t n_cols, size_t page_size, bool deterministic,
|
||||
const dmlc::TemporaryDirectory& tempdir = dmlc::TemporaryDirectory());
|
||||
|
||||
gbm::GBTreeModel CreateTestModel();
|
||||
|
||||
@@ -247,16 +249,15 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
|
||||
0.26f, 0.71f, 1.83f});
|
||||
cmat.SetMins({0.1f, 0.2f, 0.3f, 0.1f, 0.2f, 0.3f, 0.2f, 0.2f});
|
||||
|
||||
auto is_dense = (*dmat)->Info().num_nonzero_ ==
|
||||
(*dmat)->Info().num_row_ * (*dmat)->Info().num_col_;
|
||||
size_t row_stride = 0;
|
||||
const auto &offset_vec = batch.offset.ConstHostVector();
|
||||
for (size_t i = 1; i < offset_vec.size(); ++i) {
|
||||
row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
|
||||
}
|
||||
|
||||
auto page = std::unique_ptr<EllpackPageImpl>(new EllpackPageImpl(dmat->get()));
|
||||
page->InitCompressedData(0, cmat, row_stride, is_dense);
|
||||
auto page = std::unique_ptr<EllpackPageImpl>(new EllpackPageImpl(dmat->get(), {0, 256, 0}));
|
||||
page->InitInfo(0, (*dmat)->IsDense(), row_stride, cmat);
|
||||
page->InitCompressedData(0, n_rows);
|
||||
page->CreateHistIndices(0, batch, RowStateOnDevice(batch.Size(), batch.Size()));
|
||||
|
||||
delete dmat;
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
* Copyright 2017-2019 XGBoost contributors
|
||||
*/
|
||||
#include <thrust/device_vector.h>
|
||||
#include <dmlc/filesystem.h>
|
||||
#include <xgboost/base.h>
|
||||
#include <random>
|
||||
#include <string>
|
||||
@@ -207,14 +208,14 @@ TEST(GpuHist, EvaluateSplits) {
|
||||
|
||||
// Copy cut matrix to device.
|
||||
maker.ba.Allocate(0,
|
||||
&(page->ellpack_matrix.feature_segments), cmat.Ptrs().size(),
|
||||
&(page->ellpack_matrix.min_fvalue), cmat.MinValues().size(),
|
||||
&(page->ellpack_matrix.gidx_fvalue_map), 24,
|
||||
&(page->matrix.info.feature_segments), cmat.Ptrs().size(),
|
||||
&(page->matrix.info.min_fvalue), cmat.MinValues().size(),
|
||||
&(page->matrix.info.gidx_fvalue_map), 24,
|
||||
&(maker.monotone_constraints), kNCols);
|
||||
dh::CopyVectorToDeviceSpan(page->ellpack_matrix.feature_segments, cmat.Ptrs());
|
||||
dh::CopyVectorToDeviceSpan(page->ellpack_matrix.gidx_fvalue_map, cmat.Values());
|
||||
dh::CopyVectorToDeviceSpan(page->matrix.info.feature_segments, cmat.Ptrs());
|
||||
dh::CopyVectorToDeviceSpan(page->matrix.info.gidx_fvalue_map, cmat.Values());
|
||||
dh::CopyVectorToDeviceSpan(maker.monotone_constraints, param.monotone_constraints);
|
||||
dh::CopyVectorToDeviceSpan(page->ellpack_matrix.min_fvalue, cmat.MinValues());
|
||||
dh::CopyVectorToDeviceSpan(page->matrix.info.min_fvalue, cmat.MinValues());
|
||||
|
||||
// Initialize GPUHistMakerDevice::hist
|
||||
maker.hist.Init(0, (max_bins - 1) * kNCols);
|
||||
@@ -265,8 +266,10 @@ void TestHistogramIndexImpl() {
|
||||
tree::GPUHistMakerSpecialised<GradientPairPrecise> hist_maker, hist_maker_ext;
|
||||
std::unique_ptr<DMatrix> hist_maker_dmat(
|
||||
CreateSparsePageDMatrixWithRC(kNRows, kNCols, 0, true));
|
||||
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
std::unique_ptr<DMatrix> hist_maker_ext_dmat(
|
||||
CreateSparsePageDMatrixWithRC(kNRows, kNCols, 128UL, true));
|
||||
CreateSparsePageDMatrixWithRC(kNRows, kNCols, 128UL, true, tempdir));
|
||||
|
||||
std::vector<std::pair<std::string, std::string>> training_params = {
|
||||
{"max_depth", "10"},
|
||||
@@ -275,22 +278,21 @@ void TestHistogramIndexImpl() {
|
||||
|
||||
GenericParameter generic_param(CreateEmptyGenericParam(0));
|
||||
hist_maker.Configure(training_params, &generic_param);
|
||||
|
||||
hist_maker.InitDataOnce(hist_maker_dmat.get());
|
||||
hist_maker_ext.Configure(training_params, &generic_param);
|
||||
hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get());
|
||||
|
||||
// Extract the device maker from the histogram makers and from that its compressed
|
||||
// histogram index
|
||||
const auto &maker = hist_maker.maker_;
|
||||
const auto &maker = hist_maker.maker;
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.size());
|
||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer, maker->page->gidx_buffer);
|
||||
|
||||
const auto &maker_ext = hist_maker_ext.maker_;
|
||||
const auto &maker_ext = hist_maker_ext.maker;
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.size());
|
||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, maker_ext->page->gidx_buffer);
|
||||
|
||||
ASSERT_EQ(maker->page->n_bins, maker_ext->page->n_bins);
|
||||
ASSERT_EQ(maker->page->matrix.info.n_bins, maker_ext->page->matrix.info.n_bins);
|
||||
ASSERT_EQ(maker->page->gidx_buffer.size(), maker_ext->page->gidx_buffer.size());
|
||||
|
||||
ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext);
|
||||
|
||||
Reference in New Issue
Block a user