Testing hist_util (#5251)
* Rank tests
* Remove categorical split specialisation
* Extend tests to multiple features, switch to WQSketch
* Add tests for SparseCuts
* Add external memory quantile tests, fix some existing tests
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
|
||||
#include "../../../src/common/hist_util.h"
|
||||
#include "../helpers.h"
|
||||
#include "test_hist_util.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -152,14 +153,6 @@ TEST(CutsBuilder, SearchGroupInd) {
|
||||
delete pp_dmat;
|
||||
}
|
||||
|
||||
namespace {
|
||||
class SparseCutsWrapper : public SparseCuts {
|
||||
public:
|
||||
std::vector<uint32_t> const& ColPtrs() const { return p_cuts_->Ptrs(); }
|
||||
std::vector<float> const& ColValues() const { return p_cuts_->Values(); }
|
||||
};
|
||||
} // anonymous namespace
|
||||
|
||||
TEST(SparseCuts, SingleThreadedBuild) {
|
||||
size_t constexpr kRows = 267;
|
||||
size_t constexpr kCols = 31;
|
||||
@@ -235,5 +228,116 @@ TEST(SparseCuts, MultiThreadedBuild) {
|
||||
omp_set_num_threads(ori_nthreads);
|
||||
}
|
||||
|
||||
// Builds DenseCuts over a single categorical column for several row counts
// and category counts, then checks the endpoints of the resulting cuts and
// that the number of cut values equals the number of categories.
TEST(hist_util, DenseCutsCategorical) {
  int const max_bins = 256;
  for (int n_rows : {25, 100, 1000}) {
    for (int n_categories : {2, 6, 8, 12}) {
      auto column = GenerateRandomCategoricalSingleColumn(n_rows, n_categories);
      auto dmat = GetDMatrixFromData(column, n_rows, 1);

      // Sorted copy: front() is the smallest value, back() the largest.
      std::vector<float> ordered(column);
      std::sort(ordered.begin(), ordered.end());

      HistogramCuts cuts;
      DenseCuts builder(&cuts);
      builder.Build(&dmat, max_bins);

      auto const& cut_values = cuts.Values();
      // The recorded minimum lies strictly below the data and the first cut
      // strictly above the smallest value.
      EXPECT_LT(cuts.MinValues()[0], ordered.front());
      EXPECT_GT(cut_values.front(), ordered.front());
      // The last cut covers the largest value.
      EXPECT_GE(cut_values.back(), ordered.back());
      // One cut per category.
      EXPECT_EQ(cut_values.size(), n_categories);
    }
  }
}
|
||||
|
||||
// Builds DenseCuts on in-memory random data (5 columns) for several row
// counts and bin counts, and validates every column with ValidateCuts.
TEST(hist_util, DenseCutsAccuracyTest) {
  int const n_features = 5;
  for (int n_rows : {100, 1000, 1500}) {
    auto data = GenerateRandom(n_rows, n_features);
    auto dmat = GetDMatrixFromData(data, n_rows, n_features);
    // The same matrix is reused for every bin count.
    for (int max_bins : {2, 16, 256, 512}) {
      HistogramCuts cuts;
      DenseCuts builder(&cuts);
      builder.Build(&dmat, max_bins);
      ValidateCuts(cuts, data, n_rows, n_features, max_bins);
    }
  }
}
|
||||
|
||||
// Same as the in-memory DenseCuts accuracy test, but the matrix comes from
// GetExternalMemoryDMatrixFromData, called with a page size argument of 50
// and a fresh temporary directory per row count.
TEST(hist_util, DenseCutsExternalMemory) {
  int const n_features = 5;
  for (int n_rows : {100, 1000, 1500}) {
    auto data = GenerateRandom(n_rows, n_features);
    dmlc::TemporaryDirectory scratch_dir;
    auto p_dmat = GetExternalMemoryDMatrixFromData(data, n_rows, n_features,
                                                   50, scratch_dir);
    for (int max_bins : {2, 16, 256, 512}) {
      HistogramCuts cuts;
      DenseCuts builder(&cuts);
      builder.Build(p_dmat.get(), max_bins);
      ValidateCuts(cuts, data, n_rows, n_features, max_bins);
    }
  }
}
|
||||
|
||||
// Builds SparseCuts on in-memory random data (5 columns) for several row
// counts and bin counts, and validates every column with ValidateCuts.
TEST(hist_util, SparseCutsAccuracyTest) {
  int const n_features = 5;
  for (int n_rows : {100, 1000, 1500}) {
    auto data = GenerateRandom(n_rows, n_features);
    auto dmat = GetDMatrixFromData(data, n_rows, n_features);
    // The same matrix is reused for every bin count.
    for (int max_bins : {2, 16, 256, 512}) {
      HistogramCuts cuts;
      SparseCuts builder(&cuts);
      builder.Build(&dmat, max_bins);
      ValidateCuts(cuts, data, n_rows, n_features, max_bins);
    }
  }
}
|
||||
|
||||
// Builds SparseCuts over a single categorical column for several row counts
// and category counts, then checks the endpoints of the resulting cuts and
// that the number of cut values equals the number of categories.
TEST(hist_util, SparseCutsCategorical) {
  int const max_bins = 256;
  for (int n_rows : {25, 100, 1000}) {
    for (int n_categories : {2, 6, 8, 12}) {
      auto column = GenerateRandomCategoricalSingleColumn(n_rows, n_categories);
      auto dmat = GetDMatrixFromData(column, n_rows, 1);

      // Sorted copy: front() is the smallest value, back() the largest.
      std::vector<float> ordered(column);
      std::sort(ordered.begin(), ordered.end());

      HistogramCuts cuts;
      SparseCuts builder(&cuts);
      builder.Build(&dmat, max_bins);

      auto const& cut_values = cuts.Values();
      // The recorded minimum lies strictly below the data and the first cut
      // strictly above the smallest value.
      EXPECT_LT(cuts.MinValues()[0], ordered.front());
      EXPECT_GT(cut_values.front(), ordered.front());
      // The last cut covers the largest value.
      EXPECT_GE(cut_values.back(), ordered.back());
      // One cut per category.
      EXPECT_EQ(cut_values.size(), n_categories);
    }
  }
}
|
||||
|
||||
// Same as the in-memory SparseCuts accuracy test, but the matrix comes from
// GetExternalMemoryDMatrixFromData, called with a page size argument of 50
// and a fresh temporary directory per row count.
TEST(hist_util, SparseCutsExternalMemory) {
  int const n_features = 5;
  for (int n_rows : {100, 1000, 1500}) {
    auto data = GenerateRandom(n_rows, n_features);
    dmlc::TemporaryDirectory scratch_dir;
    auto p_dmat = GetExternalMemoryDMatrixFromData(data, n_rows, n_features,
                                                   50, scratch_dir);
    for (int max_bins : {2, 16, 256, 512}) {
      HistogramCuts cuts;
      SparseCuts sparse(&cuts);
      sparse.Build(p_dmat.get(), max_bins);
      ValidateCuts(cuts, data, n_rows, n_features, max_bins);
    }
  }
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
159
tests/cpp/common/test_hist_util.h
Normal file
159
tests/cpp/common/test_hist_util.h
Normal file
@@ -0,0 +1,159 @@
|
||||
#pragma once
|
||||
#include <gtest/gtest.h>
|
||||
#include "../../../src/data/simple_dmatrix.h"
|
||||
|
||||
// Some helper functions used to test both GPU and CPU algorithms
|
||||
//
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
// Generate a dense, row-major num_rows x num_columns matrix whose columns
// cover different ranges: every entry is drawn from a uniform [0, 1)
// distribution and then shifted by its column index, so column c holds values
// in roughly [c, c + 1].
//
// The generator is seeded with 0, so repeated calls with the same arguments
// return the same data. Returns an empty vector when either dimension is not
// positive.
inline std::vector<float> GenerateRandom(int num_rows, int num_columns) {
  if (num_rows <= 0 || num_columns <= 0) {
    return {};
  }
  // Widen before multiplying so the element count cannot overflow int.
  auto const n_rows = static_cast<size_t>(num_rows);
  auto const n_cols = static_cast<size_t>(num_columns);

  std::vector<float> x(n_rows * n_cols);
  std::mt19937 rng(0);
  std::uniform_real_distribution<float> dist(0.0, 1.0);
  std::generate(x.begin(), x.end(), [&]() { return dist(rng); });

  // Shift each column by its index; walking row by row follows the layout.
  for (size_t row = 0; row < n_rows; ++row) {
    for (size_t col = 0; col < n_cols; ++col) {
      x[row * n_cols + col] += static_cast<float>(col);
    }
  }
  return x;
}
|
||||
|
||||
// Generate a single column of n categorical values, each an integer in
// [0, num_categories) stored as float.
//
// Values are drawn uniformly with a generator seeded with 0, then the leading
// entries are overwritten with 0, 1, 2, ... so that every category is present.
// When there are more categories than rows only the first n categories can be
// forced in. Returns an empty vector when n or num_categories is not positive.
inline std::vector<float> GenerateRandomCategoricalSingleColumn(int n,
                                                                int num_categories) {
  if (n <= 0 || num_categories <= 0) {
    return {};
  }
  std::vector<float> x(static_cast<size_t>(n));
  std::mt19937 rng(0);
  std::uniform_int_distribution<int> dist(0, num_categories - 1);
  std::generate(x.begin(), x.end(),
                [&]() { return static_cast<float>(dist(rng)); });

  // Make sure each category is present, without writing past the end of x.
  int const n_forced = std::min(n, num_categories);
  for (int category = 0; category < n_forced; ++category) {
    x[static_cast<size_t>(category)] = static_cast<float>(category);
  }
  return x;
}
|
||||
|
||||
// Wrap a dense float buffer in a data::SimpleDMatrix.
//
// x is handed to a data::DenseAdapter together with num_rows and num_columns,
// and quiet NaN is passed as the second constructor argument.
// NOTE(review): x is presumably row-major with num_rows * num_columns entries,
// NaN the "missing" marker and the trailing 1 a thread count -- confirm
// against DenseAdapter / SimpleDMatrix.
inline data::SimpleDMatrix GetDMatrixFromData(const std::vector<float>& x, int num_rows, int num_columns) {
  float const nan_value = std::numeric_limits<float>::quiet_NaN();
  data::DenseAdapter dense_adapter(x.data(), num_rows, num_columns);
  return data::SimpleDMatrix(&dense_adapter, nan_value, 1);
}
|
||||
|
||||
inline std::shared_ptr<DMatrix> GetExternalMemoryDMatrixFromData(
|
||||
const std::vector<float>& x, int num_rows, int num_columns,
|
||||
size_t page_size, const dmlc::TemporaryDirectory& tempdir) {
|
||||
// Create the svm file in a temp dir
|
||||
const std::string tmp_file = tempdir.path + "/temp.libsvm";
|
||||
std::ofstream fo(tmp_file.c_str());
|
||||
for (auto i = 0ull; i < num_rows; i++) {
|
||||
std::stringstream row_data;
|
||||
for (auto j = 0ull; j < num_columns; j++) {
|
||||
row_data << 1 << " " << j << ":" << std::setprecision(15)
|
||||
<< x[i * num_columns + j];
|
||||
}
|
||||
fo << row_data.str() << "\n";
|
||||
}
|
||||
fo.close();
|
||||
return std::shared_ptr<DMatrix>(DMatrix::Load(
|
||||
tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size));
|
||||
}
|
||||
|
||||
// Test that elements are approximately equally distributed among bins
|
||||
inline void TestBinDistribution(const HistogramCuts& cuts, int column_idx,
|
||||
const std::vector<float>& column,
|
||||
int num_bins) {
|
||||
std::map<int, int> counts;
|
||||
for (auto& v : column) {
|
||||
counts[cuts.SearchBin(v, column_idx)]++;
|
||||
}
|
||||
int local_num_bins = cuts.Ptrs()[column_idx + 1] - cuts.Ptrs()[column_idx];
|
||||
int expected_num_elements = column.size() / local_num_bins;
|
||||
// Allow about 30% deviation. This test is not very strict, it only ensures
|
||||
// roughly equal distribution
|
||||
int allowable_error = std::max(2, int(expected_num_elements * 0.3));
|
||||
|
||||
// First and last bin can have smaller
|
||||
for (auto& kv : counts) {
|
||||
EXPECT_LE(std::abs(counts[kv.first] - expected_num_elements),
|
||||
allowable_error );
|
||||
}
|
||||
}
|
||||
|
||||
// Test sketch quantiles against the real quantiles
|
||||
// Not a very strict test
|
||||
inline void TestRank(const std::vector<float>& cuts,
|
||||
const std::vector<float>& sorted_x) {
|
||||
float eps = 0.05;
|
||||
// Ignore the last cut, its special
|
||||
size_t j = 0;
|
||||
for (auto i = 0; i < cuts.size() - 1; i++) {
|
||||
int expected_rank = ((i+1) * sorted_x.size()) / cuts.size();
|
||||
while (cuts[i] > sorted_x[j]) {
|
||||
j++;
|
||||
}
|
||||
int actual_rank = j;
|
||||
int acceptable_error = std::max(2, int(sorted_x.size() * eps));
|
||||
ASSERT_LE(std::abs(expected_rank - actual_rank), acceptable_error);
|
||||
}
|
||||
}
|
||||
|
||||
inline void ValidateColumn(const HistogramCuts& cuts, int column_idx,
|
||||
const std::vector<float>& column,
|
||||
int num_bins) {
|
||||
std::vector<float> sorted_column(column);
|
||||
std::sort(sorted_column.begin(), sorted_column.end());
|
||||
|
||||
// Check the endpoints are correct
|
||||
EXPECT_LT(cuts.MinValues()[column_idx], sorted_column.front());
|
||||
EXPECT_GT(cuts.Values()[cuts.Ptrs()[column_idx]], sorted_column.front());
|
||||
EXPECT_GE(cuts.Values()[cuts.Ptrs()[column_idx+1]-1], sorted_column.back());
|
||||
|
||||
// Check the cuts are sorted
|
||||
auto cuts_begin = cuts.Values().begin() + cuts.Ptrs()[column_idx];
|
||||
auto cuts_end = cuts.Values().begin() + cuts.Ptrs()[column_idx + 1];
|
||||
EXPECT_TRUE(std::is_sorted(cuts_begin, cuts_end));
|
||||
|
||||
// Check all cut points are unique
|
||||
EXPECT_EQ(std::set<float>(cuts_begin, cuts_end).size(),
|
||||
cuts_end - cuts_begin);
|
||||
|
||||
if (sorted_column.size() <= num_bins) {
|
||||
// Less unique values than number of bins
|
||||
// Each value should get its own bin
|
||||
|
||||
// First check the inputs are unique
|
||||
int num_unique =
|
||||
std::set<float>(sorted_column.begin(), sorted_column.end()).size();
|
||||
EXPECT_EQ(num_unique, sorted_column.size());
|
||||
for (auto i = 0ull; i < sorted_column.size(); i++) {
|
||||
ASSERT_EQ(cuts.SearchBin(sorted_column[i], column_idx),
|
||||
cuts.Ptrs()[column_idx] + i);
|
||||
}
|
||||
}
|
||||
int num_cuts_column = cuts.Ptrs()[column_idx + 1] - cuts.Ptrs()[column_idx];
|
||||
std::vector<float> column_cuts(num_cuts_column);
|
||||
std::copy(cuts.Values().begin() + cuts.Ptrs()[column_idx],
|
||||
cuts.Values().begin() + cuts.Ptrs()[column_idx + 1],
|
||||
column_cuts.begin());
|
||||
TestBinDistribution(cuts, column_idx, sorted_column, num_bins);
|
||||
TestRank(column_cuts, sorted_column);
|
||||
}
|
||||
|
||||
// x is dense and row major
|
||||
inline void ValidateCuts(const HistogramCuts& cuts, std::vector<float>& x,
|
||||
int num_rows, int num_columns,
|
||||
int num_bins) {
|
||||
for (auto i = 0ull; i < num_columns; i++) {
|
||||
// Extract the column
|
||||
std::vector<float> column(num_rows);
|
||||
for (auto j = 0ull; j < num_rows; j++) {
|
||||
column[j] = x[j*num_columns + i];
|
||||
}
|
||||
ValidateColumn(cuts,i, column, num_bins);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
Reference in New Issue
Block a user