Unify CPU hist sketching (#5880)

This commit is contained in:
Jiaming Yuan
2020-08-12 01:33:06 +08:00
committed by GitHub
parent bd6b7f4aa7
commit ee70a2380b
18 changed files with 648 additions and 677 deletions

View File

@@ -158,86 +158,20 @@ TEST(CutsBuilder, SearchGroupInd) {
HistogramCuts hmat;
size_t group_ind = CutsBuilder::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 0);
size_t group_ind = HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 0);
ASSERT_EQ(group_ind, 0);
group_ind = CutsBuilder::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 5);
group_ind = HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 5);
ASSERT_EQ(group_ind, 2);
EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17));
p_mat->Info().Validate(-1);
EXPECT_THROW(CutsBuilder::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17),
EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17),
dmlc::Error);
std::vector<bst_uint> group_ptr {0, 1, 2};
CHECK_EQ(CutsBuilder::SearchGroupIndFromRow(group_ptr, 1), 1);
}
TEST(SparseCuts, SingleThreadedBuild) {
size_t constexpr kRows = 267;
size_t constexpr kCols = 31;
size_t constexpr kBins = 256;
auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
common::GHistIndexMatrix hmat;
hmat.Init(p_fmat.get(), kBins);
HistogramCuts cuts;
SparseCuts indices(&cuts);
auto const& page = *(p_fmat->GetBatches<xgboost::CSCPage>().begin());
indices.SingleThreadBuild(page, p_fmat->Info(), kBins, false, 0, page.Size(), 0);
ASSERT_EQ(hmat.cut.Ptrs().size(), cuts.Ptrs().size());
ASSERT_EQ(hmat.cut.Ptrs(), cuts.Ptrs());
ASSERT_EQ(hmat.cut.Values(), cuts.Values());
ASSERT_EQ(hmat.cut.MinValues(), cuts.MinValues());
}
TEST(SparseCuts, MultiThreadedBuild) {
size_t constexpr kRows = 17;
size_t constexpr kCols = 15;
size_t constexpr kBins = 255;
omp_ulong ori_nthreads = omp_get_max_threads();
omp_set_num_threads(16);
auto Compare =
#if defined(_MSC_VER) // msvc fails to capture
[kBins](DMatrix* p_fmat) {
#else
[](DMatrix* p_fmat) {
#endif
HistogramCuts threaded_container;
SparseCuts threaded_indices(&threaded_container);
threaded_indices.Build(p_fmat, kBins);
HistogramCuts container;
SparseCuts indices(&container);
auto const& page = *(p_fmat->GetBatches<xgboost::CSCPage>().begin());
indices.SingleThreadBuild(page, p_fmat->Info(), kBins, false, 0, page.Size(), 0);
ASSERT_EQ(container.Ptrs().size(), threaded_container.Ptrs().size());
ASSERT_EQ(container.Values().size(), threaded_container.Values().size());
for (uint32_t i = 0; i < container.Ptrs().size(); ++i) {
ASSERT_EQ(container.Ptrs()[i], threaded_container.Ptrs()[i]);
}
for (uint32_t i = 0; i < container.Values().size(); ++i) {
ASSERT_EQ(container.Values()[i], threaded_container.Values()[i]);
}
};
{
auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
Compare(p_fmat.get());
}
{
auto p_fmat = RandomDataGenerator(kRows, kCols, 0.0001).GenerateDMatrix();
Compare(p_fmat.get());
}
omp_set_num_threads(ori_nthreads);
CHECK_EQ(HostSketchContainer::SearchGroupIndFromRow(group_ptr, 1), 1);
}
TEST(HistUtil, DenseCutsCategorical) {
@@ -250,9 +184,7 @@ TEST(HistUtil, DenseCutsCategorical) {
std::vector<float> x_sorted(x);
std::sort(x_sorted.begin(), x_sorted.end());
auto dmat = GetDMatrixFromData(x, n, 1);
HistogramCuts cuts;
DenseCuts dense(&cuts);
dense.Build(dmat.get(), num_bins);
HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins);
auto cuts_from_sketch = cuts.Values();
EXPECT_LT(cuts.MinValues()[0], x_sorted.front());
EXPECT_GT(cuts_from_sketch.front(), x_sorted.front());
@@ -264,15 +196,14 @@ TEST(HistUtil, DenseCutsCategorical) {
TEST(HistUtil, DenseCutsAccuracyTest) {
int bin_sizes[] = {2, 16, 256, 512};
int sizes[] = {100, 1000, 1500};
int sizes[] = {100};
// omp_set_num_threads(1);
int num_columns = 5;
for (auto num_rows : sizes) {
auto x = GenerateRandom(num_rows, num_columns);
auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
for (auto num_bins : bin_sizes) {
HistogramCuts cuts;
DenseCuts dense(&cuts);
dense.Build(dmat.get(), num_bins);
HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins);
ValidateCuts(cuts, dmat.get(), num_bins);
}
}
@@ -288,9 +219,7 @@ TEST(HistUtil, DenseCutsAccuracyTestWeights) {
auto w = GenerateRandomWeights(num_rows);
dmat->Info().weights_.HostVector() = w;
for (auto num_bins : bin_sizes) {
HistogramCuts cuts;
DenseCuts dense(&cuts);
dense.Build(dmat.get(), num_bins);
HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins);
ValidateCuts(cuts, dmat.get(), num_bins);
}
}
@@ -306,65 +235,7 @@ TEST(HistUtil, DenseCutsExternalMemory) {
auto dmat =
GetExternalMemoryDMatrixFromData(x, num_rows, num_columns, 50, tmpdir);
for (auto num_bins : bin_sizes) {
HistogramCuts cuts;
DenseCuts dense(&cuts);
dense.Build(dmat.get(), num_bins);
ValidateCuts(cuts, dmat.get(), num_bins);
}
}
}
TEST(HistUtil, SparseCutsAccuracyTest) {
int bin_sizes[] = {2, 16, 256, 512};
int sizes[] = {100, 1000, 1500};
int num_columns = 5;
for (auto num_rows : sizes) {
auto x = GenerateRandom(num_rows, num_columns);
auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
for (auto num_bins : bin_sizes) {
HistogramCuts cuts;
SparseCuts sparse(&cuts);
sparse.Build(dmat.get(), num_bins);
ValidateCuts(cuts, dmat.get(), num_bins);
}
}
}
TEST(HistUtil, SparseCutsCategorical) {
int categorical_sizes[] = {2, 6, 8, 12};
int num_bins = 256;
int sizes[] = {25, 100, 1000};
for (auto n : sizes) {
for (auto num_categories : categorical_sizes) {
auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
std::vector<float> x_sorted(x);
std::sort(x_sorted.begin(), x_sorted.end());
auto dmat = GetDMatrixFromData(x, n, 1);
HistogramCuts cuts;
SparseCuts sparse(&cuts);
sparse.Build(dmat.get(), num_bins);
auto cuts_from_sketch = cuts.Values();
EXPECT_LT(cuts.MinValues()[0], x_sorted.front());
EXPECT_GT(cuts_from_sketch.front(), x_sorted.front());
EXPECT_GE(cuts_from_sketch.back(), x_sorted.back());
EXPECT_EQ(cuts_from_sketch.size(), num_categories);
}
}
}
TEST(HistUtil, SparseCutsExternalMemory) {
int bin_sizes[] = {2, 16, 256, 512};
int sizes[] = {100, 1000, 1500};
int num_columns = 5;
for (auto num_rows : sizes) {
auto x = GenerateRandom(num_rows, num_columns);
dmlc::TemporaryDirectory tmpdir;
auto dmat =
GetExternalMemoryDMatrixFromData(x, num_rows, num_columns, 50, tmpdir);
for (auto num_bins : bin_sizes) {
HistogramCuts cuts;
SparseCuts dense(&cuts);
dense.Build(dmat.get(), num_bins);
HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins);
ValidateCuts(cuts, dmat.get(), num_bins);
}
}
@@ -391,25 +262,6 @@ TEST(HistUtil, IndexBinBound) {
}
}
TEST(HistUtil, SparseIndexBinBound) {
uint64_t bin_sizes[] = { static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
BinTypeSize expected_bin_type_sizes[] = { kUint32BinsTypeSize,
kUint32BinsTypeSize,
kUint32BinsTypeSize };
size_t constexpr kRows = 100;
size_t constexpr kCols = 10;
size_t bin_id = 0;
for (auto max_bin : bin_sizes) {
auto p_fmat = RandomDataGenerator(kRows, kCols, 0.2).GenerateDMatrix();
common::GHistIndexMatrix hmat;
hmat.Init(p_fmat.get(), max_bin);
EXPECT_EQ(expected_bin_type_sizes[bin_id++], hmat.index.GetBinTypeSize());
}
}
template <typename T>
void CheckIndexData(T* data_ptr, uint32_t* offsets,
const common::GHistIndexMatrix& hmat, size_t n_cols) {
@@ -448,25 +300,61 @@ TEST(HistUtil, IndexBinData) {
}
}
TEST(HistUtil, SparseIndexBinData) {
uint64_t bin_sizes[] = { static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
size_t constexpr kRows = 100;
size_t constexpr kCols = 10;
void TestSketchFromWeights(bool with_group) {
size_t constexpr kRows = 300, kCols = 20, kBins = 256;
size_t constexpr kGroups = 10;
auto m =
RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix();
common::HistogramCuts cuts = SketchOnDMatrix(m.get(), kBins);
for (auto max_bin : bin_sizes) {
auto p_fmat = RandomDataGenerator(kRows, kCols, 0.2).GenerateDMatrix();
common::GHistIndexMatrix hmat;
hmat.Init(p_fmat.get(), max_bin);
EXPECT_EQ(hmat.index.Offset(), nullptr);
MetaInfo info;
auto& h_weights = info.weights_.HostVector();
if (with_group) {
h_weights.resize(kGroups);
} else {
h_weights.resize(kRows);
}
std::fill(h_weights.begin(), h_weights.end(), 1.0f);
uint32_t* data_ptr = hmat.index.data<uint32_t>();
for (size_t i = 0; i < hmat.index.Size(); ++i) {
EXPECT_EQ(data_ptr[i], hmat.index[i]);
std::vector<bst_group_t> groups(kGroups);
if (with_group) {
for (size_t i = 0; i < kGroups; ++i) {
groups[i] = kRows / kGroups;
}
info.SetInfo("group", groups.data(), DataType::kUInt32, kGroups);
}
info.num_row_ = kRows;
info.num_col_ = kCols;
// Assign weights.
if (with_group) {
m->Info().SetInfo("group", groups.data(), DataType::kUInt32, kGroups);
}
m->Info().SetInfo("weight", h_weights.data(), DataType::kFloat32, h_weights.size());
m->Info().num_col_ = kCols;
m->Info().num_row_ = kRows;
ASSERT_EQ(cuts.Ptrs().size(), kCols + 1);
ValidateCuts(cuts, m.get(), kBins);
if (with_group) {
HistogramCuts non_weighted = SketchOnDMatrix(m.get(), kBins);
for (size_t i = 0; i < cuts.Values().size(); ++i) {
EXPECT_EQ(cuts.Values()[i], non_weighted.Values()[i]);
}
for (size_t i = 0; i < cuts.MinValues().size(); ++i) {
ASSERT_EQ(cuts.MinValues()[i], non_weighted.MinValues()[i]);
}
for (size_t i = 0; i < cuts.Ptrs().size(); ++i) {
ASSERT_EQ(cuts.Ptrs().at(i), non_weighted.Ptrs().at(i));
}
}
}
TEST(HistUtil, SketchFromWeights) {
TestSketchFromWeights(true);
TestSketchFromWeights(false);
}
} // namespace common
} // namespace xgboost