Initial support for external memory in gradient index. (#7183)
* Add hessian to batch param in preparation of new approx impl. * Extract a push method for gradient index matrix. * Use span instead of vector ref for hessian in sketching. * Create a binary format for gradient index.
This commit is contained in:
26
tests/cpp/data/test_gradient_index.cc
Normal file
26
tests/cpp/data/test_gradient_index.cc
Normal file
@@ -0,0 +1,26 @@
|
||||
/*!
|
||||
* Copyright 2021 XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h>
|
||||
|
||||
#include "../helpers.h"
|
||||
#include "../../../src/data/gradient_index.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
TEST(GradientIndex, ExternalMemory) {
|
||||
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
|
||||
std::vector<size_t> base_rowids;
|
||||
std::vector<float> hessian(dmat->Info().num_row_, 1);
|
||||
for (auto const& page : dmat->GetBatches<GHistIndexMatrix>({0, 64, hessian})) {
|
||||
base_rowids.push_back(page.base_rowid);
|
||||
}
|
||||
size_t i = 0;
|
||||
for (auto const& page : dmat->GetBatches<SparsePage>()) {
|
||||
ASSERT_EQ(base_rowids[i], page.base_rowid);
|
||||
++i;
|
||||
}
|
||||
}
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
48
tests/cpp/data/test_gradient_index_page_raw_format.cc
Normal file
48
tests/cpp/data/test_gradient_index_page_raw_format.cc
Normal file
@@ -0,0 +1,48 @@
|
||||
/*!
|
||||
* Copyright 2021 XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "../../../src/data/gradient_index.h"
|
||||
#include "../../../src/data/sparse_page_source.h"
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
TEST(GHistIndexPageRawFormat, IO) {
|
||||
std::unique_ptr<SparsePageFormat<GHistIndexMatrix>> format{
|
||||
CreatePageFormat<GHistIndexMatrix>("raw")};
|
||||
auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string path = tmpdir.path + "/ghistindex.page";
|
||||
|
||||
{
|
||||
std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
|
||||
for (auto const &index :
|
||||
m->GetBatches<GHistIndexMatrix>({GenericParameter::kCpuId, 256})) {
|
||||
format->Write(index, fo.get());
|
||||
}
|
||||
}
|
||||
|
||||
GHistIndexMatrix page;
|
||||
std::unique_ptr<dmlc::SeekStream> fi{
|
||||
dmlc::SeekStream::CreateForRead(path.c_str())};
|
||||
format->Read(&page, fi.get());
|
||||
|
||||
for (auto const &gidx :
|
||||
m->GetBatches<GHistIndexMatrix>({GenericParameter::kCpuId, 256})) {
|
||||
auto const &loaded = gidx;
|
||||
ASSERT_EQ(loaded.cut.Ptrs(), page.cut.Ptrs());
|
||||
ASSERT_EQ(loaded.cut.MinValues(), page.cut.MinValues());
|
||||
ASSERT_EQ(loaded.cut.Values(), page.cut.Values());
|
||||
ASSERT_EQ(loaded.base_rowid, page.base_rowid);
|
||||
ASSERT_EQ(loaded.IsDense(), page.IsDense());
|
||||
ASSERT_TRUE(std::equal(loaded.index.begin(), loaded.index.end(),
|
||||
page.index.begin()));
|
||||
ASSERT_TRUE(std::equal(loaded.index.Offset(),
|
||||
loaded.index.Offset() + loaded.index.OffsetSize(),
|
||||
page.index.Offset()));
|
||||
}
|
||||
}
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
Reference in New Issue
Block a user