Initial support for external memory in gradient index. (#7183)

* Add hessian to batch param in preparation for the new approx implementation.
* Extract a push method for gradient index matrix.
* Use span instead of vector ref for hessian in sketching.
* Create a binary format for gradient index.
This commit is contained in:
Jiaming Yuan
2021-09-13 12:40:56 +08:00
committed by GitHub
parent a0dcf6f5c1
commit 3515931305
26 changed files with 546 additions and 171 deletions

View File

@@ -0,0 +1,26 @@
/*!
* Copyright 2021 XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/data.h>
#include "../helpers.h"
#include "../../../src/data/gradient_index.h"
namespace xgboost {
namespace data {
// Checks that iterating a DMatrix created by CreateSparsePageDMatrix as
// GHistIndexMatrix batches yields the same sequence of base row ids as
// iterating it as SparsePage batches.
TEST(GradientIndex, ExternalMemory) {
  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
  // One hessian entry per row, all set to 1; handed over via the batch parameter.
  std::vector<float> hessian(dmat->Info().num_row_, 1);

  // Record the starting row id of every gradient index page, in iteration order.
  std::vector<size_t> base_rowids;
  for (auto const& page : dmat->GetBatches<GHistIndexMatrix>({0, 64, hessian})) {
    base_rowids.push_back(page.base_rowid);
  }

  size_t i = 0;
  for (auto const& page : dmat->GetBatches<SparsePage>()) {
    // Guard the lookup: more sparse pages than gradient index pages must fail
    // the test instead of reading past the end of the vector.
    ASSERT_LT(i, base_rowids.size());
    ASSERT_EQ(base_rowids[i], page.base_rowid);
    ++i;
  }
  // Both views must produce the same number of pages; otherwise trailing
  // gradient index pages would go unchecked.
  ASSERT_EQ(i, base_rowids.size());
}
} // namespace data
} // namespace xgboost

View File

@@ -0,0 +1,48 @@
/*!
* Copyright 2021 XGBoost contributors
*/
#include <gtest/gtest.h>
#include "../../../src/data/gradient_index.h"
#include "../../../src/data/sparse_page_source.h"
#include "../helpers.h"
namespace xgboost {
namespace data {
// Writes gradient index pages with the "raw" page format into a temporary
// file, reads one page back and checks that its cuts, base row id, density
// flag, index data and offsets match the in-memory page.
TEST(GHistIndexPageRawFormat, IO) {
  std::unique_ptr<SparsePageFormat<GHistIndexMatrix>> format{
      CreatePageFormat<GHistIndexMatrix>("raw")};
  auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();

  dmlc::TemporaryDirectory tmpdir;
  std::string path = tmpdir.path + "/ghistindex.page";
  {
    // Scoped so the output stream is released before the file is reopened
    // for reading below.
    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
    for (auto const &index :
         m->GetBatches<GHistIndexMatrix>({GenericParameter::kCpuId, 256})) {
      format->Write(index, fo.get());
    }
  }

  // NOTE(review): only a single page is read back, while the loop below
  // compares it against every batch; this presumably relies on the DMatrix
  // yielding exactly one batch -- confirm.
  GHistIndexMatrix page;
  std::unique_ptr<dmlc::SeekStream> fi{
      dmlc::SeekStream::CreateForRead(path.c_str())};
  format->Read(&page, fi.get());

  for (auto const &expected :
       m->GetBatches<GHistIndexMatrix>({GenericParameter::kCpuId, 256})) {
    ASSERT_EQ(expected.cut.Ptrs(), page.cut.Ptrs());
    ASSERT_EQ(expected.cut.MinValues(), page.cut.MinValues());
    ASSERT_EQ(expected.cut.Values(), page.cut.Values());
    ASSERT_EQ(expected.base_rowid, page.base_rowid);
    ASSERT_EQ(expected.IsDense(), page.IsDense());

    // The four-iterator overload also compares the range lengths, so a
    // truncated or over-long loaded index fails instead of being read out
    // of bounds or passing on a matching prefix.
    ASSERT_TRUE(std::equal(expected.index.begin(), expected.index.end(),
                           page.index.begin(), page.index.end()));

    // Offsets are exposed as a pointer plus a size, so check the size first
    // and only then compare the elements.
    ASSERT_EQ(expected.index.OffsetSize(), page.index.OffsetSize());
    ASSERT_TRUE(std::equal(expected.index.Offset(),
                           expected.index.Offset() + expected.index.OffsetSize(),
                           page.index.Offset()));
  }
}
} // namespace data
} // namespace xgboost