Testing hist_util (#5251)

* Rank tests

* Remove categorical split specialisation

* Extend tests to multiple features, switch to WQSketch

* Add tests for SparseCuts

* Add external memory quantile tests, fix some existing tests
This commit is contained in:
Rory Mitchell
2020-02-14 14:36:43 +13:00
committed by GitHub
parent 911a902835
commit 24ad9dec0b
10 changed files with 354 additions and 93 deletions

View File

@@ -5,6 +5,7 @@
#include "../../../src/common/hist_util.h"
#include "../helpers.h"
#include "test_hist_util.h"
namespace xgboost {
namespace common {
@@ -152,14 +153,6 @@ TEST(CutsBuilder, SearchGroupInd) {
delete pp_dmat;
}
namespace {
// Thin subclass of SparseCuts that gives the tests read access to the column
// pointers and cut values reachable through p_cuts_.
// NOTE(review): p_cuts_ is not declared in this class; presumably it is a
// member inherited from SparseCuts -- confirm against hist_util.h.
class SparseCutsWrapper : public SparseCuts {
public:
// Forwards to p_cuts_->Ptrs().
std::vector<uint32_t> const& ColPtrs() const { return p_cuts_->Ptrs(); }
// Forwards to p_cuts_->Values().
std::vector<float> const& ColValues() const { return p_cuts_->Values(); }
};
} // anonymous namespace
TEST(SparseCuts, SingleThreadedBuild) {
size_t constexpr kRows = 267;
size_t constexpr kCols = 31;
@@ -235,5 +228,116 @@ TEST(SparseCuts, MultiThreadedBuild) {
omp_set_num_threads(ori_nthreads);
}
// Builds DenseCuts over a single categorical column for several row counts
// and category counts, then checks the cut end points against the data range
// and that the number of cuts equals the number of categories.
TEST(hist_util, DenseCutsCategorical) {
  const int kNumBins = 256;
  const int row_counts[] = {25, 100, 1000};
  const int category_counts[] = {2, 6, 8, 12};
  for (auto n : row_counts) {
    for (auto num_categories : category_counts) {
      auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
      // Smallest and largest value present in the column.
      const auto range = std::minmax_element(x.begin(), x.end());
      const float smallest = *range.first;
      const float largest = *range.second;

      auto dmat = GetDMatrixFromData(x, n, 1);
      HistogramCuts cuts;
      DenseCuts builder(&cuts);
      builder.Build(&dmat, kNumBins);

      const auto& cut_values = cuts.Values();
      // The recorded minimum lies strictly below the data, the first cut
      // strictly above the smallest value, and the last cut covers the largest.
      EXPECT_LT(cuts.MinValues()[0], smallest);
      EXPECT_GT(cut_values.front(), smallest);
      EXPECT_GE(cut_values.back(), largest);
      EXPECT_EQ(cut_values.size(), num_categories);
    }
  }
}
// Builds DenseCuts on in-memory random data for several row counts and bin
// counts and runs the shared ValidateCuts checks on every result.
TEST(hist_util, DenseCutsAccuracyTest) {
  const int kNumColumns = 5;
  const int row_counts[] = {100, 1000, 1500};
  const int bin_counts[] = {2, 16, 256, 512};
  for (auto num_rows : row_counts) {
    // The same data set is reused for every bin count.
    auto data = GenerateRandom(num_rows, kNumColumns);
    auto dmat = GetDMatrixFromData(data, num_rows, kNumColumns);
    for (auto num_bins : bin_counts) {
      HistogramCuts cuts;
      DenseCuts builder(&cuts);
      builder.Build(&dmat, num_bins);
      ValidateCuts(cuts, data, num_rows, kNumColumns, num_bins);
    }
  }
}
// Same checks as the in-memory dense test, but the DMatrix is produced by
// GetExternalMemoryDMatrixFromData with a page size of 50.
TEST(hist_util, DenseCutsExternalMemory) {
  const int kNumColumns = 5;
  const size_t kPageSize = 50;
  const int row_counts[] = {100, 1000, 1500};
  const int bin_counts[] = {2, 16, 256, 512};
  for (auto num_rows : row_counts) {
    auto data = GenerateRandom(num_rows, kNumColumns);
    // Fresh temporary directory per data set; it outlives the DMatrix use below.
    dmlc::TemporaryDirectory tmpdir;
    auto dmat = GetExternalMemoryDMatrixFromData(data, num_rows, kNumColumns,
                                                 kPageSize, tmpdir);
    for (auto num_bins : bin_counts) {
      HistogramCuts cuts;
      DenseCuts builder(&cuts);
      builder.Build(dmat.get(), num_bins);
      ValidateCuts(cuts, data, num_rows, kNumColumns, num_bins);
    }
  }
}
// Builds SparseCuts on in-memory random data for several row counts and bin
// counts and runs the shared ValidateCuts checks on every result.
TEST(hist_util, SparseCutsAccuracyTest) {
  const int kNumColumns = 5;
  const int row_counts[] = {100, 1000, 1500};
  const int bin_counts[] = {2, 16, 256, 512};
  for (auto num_rows : row_counts) {
    // The same data set is reused for every bin count.
    auto data = GenerateRandom(num_rows, kNumColumns);
    auto dmat = GetDMatrixFromData(data, num_rows, kNumColumns);
    for (auto num_bins : bin_counts) {
      HistogramCuts cuts;
      SparseCuts builder(&cuts);
      builder.Build(&dmat, num_bins);
      ValidateCuts(cuts, data, num_rows, kNumColumns, num_bins);
    }
  }
}
// Builds SparseCuts over a single categorical column for several row counts
// and category counts, then checks the cut end points against the data range
// and that the number of cuts equals the number of categories.
TEST(hist_util, SparseCutsCategorical) {
  const int kNumBins = 256;
  const int row_counts[] = {25, 100, 1000};
  const int category_counts[] = {2, 6, 8, 12};
  for (auto n : row_counts) {
    for (auto num_categories : category_counts) {
      auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
      // Smallest and largest value present in the column.
      const auto range = std::minmax_element(x.begin(), x.end());
      const float smallest = *range.first;
      const float largest = *range.second;

      auto dmat = GetDMatrixFromData(x, n, 1);
      HistogramCuts cuts;
      SparseCuts builder(&cuts);
      builder.Build(&dmat, kNumBins);

      const auto& cut_values = cuts.Values();
      // The recorded minimum lies strictly below the data, the first cut
      // strictly above the smallest value, and the last cut covers the largest.
      EXPECT_LT(cuts.MinValues()[0], smallest);
      EXPECT_GT(cut_values.front(), smallest);
      EXPECT_GE(cut_values.back(), largest);
      EXPECT_EQ(cut_values.size(), num_categories);
    }
  }
}
// Same checks as the in-memory sparse test, but the DMatrix is produced by
// GetExternalMemoryDMatrixFromData with a page size of 50.
TEST(hist_util, SparseCutsExternalMemory) {
  const int kNumColumns = 5;
  const size_t kPageSize = 50;
  const int row_counts[] = {100, 1000, 1500};
  const int bin_counts[] = {2, 16, 256, 512};
  for (auto num_rows : row_counts) {
    auto data = GenerateRandom(num_rows, kNumColumns);
    // Fresh temporary directory per data set; it outlives the DMatrix use below.
    dmlc::TemporaryDirectory tmpdir;
    auto dmat = GetExternalMemoryDMatrixFromData(data, num_rows, kNumColumns,
                                                 kPageSize, tmpdir);
    for (auto num_bins : bin_counts) {
      HistogramCuts cuts;
      SparseCuts sparse(&cuts);
      sparse.Build(dmat.get(), num_bins);
      ValidateCuts(cuts, data, num_rows, kNumColumns, num_bins);
    }
  }
}
} // namespace common
} // namespace xgboost

View File

@@ -0,0 +1,159 @@
#pragma once
#include <gtest/gtest.h>
#include "../../../src/data/simple_dmatrix.h"
// Some helper functions used to test both GPU and CPU algorithms
//
namespace xgboost {
namespace common {
// Generate columns with different ranges
// Produces a dense, row-major num_rows x num_columns matrix. Every entry is
// drawn from a uniform [0, 1) distribution using a fixed seed (0), and then
// column c is shifted by c so that each column covers a different range.
inline std::vector<float> GenerateRandom(int num_rows, int num_columns) {
  std::mt19937 rng(0);
  std::uniform_real_distribution<float> dist(0.0, 1.0);

  std::vector<float> x(num_rows * num_columns);
  // Fill in storage order so the random sequence is consumed front to back.
  for (float& value : x) {
    value = dist(rng);
  }

  // Offset every entry by the index of the column it belongs to.
  for (int row = 0; row < num_rows; ++row) {
    float* row_begin = x.data() + static_cast<std::size_t>(row) * num_columns;
    for (int col = 0; col < num_columns; ++col) {
      row_begin[col] += col;
    }
  }
  return x;
}
// Produces a single column of n categorical values.
//
// Every entry is first drawn uniformly from the integers
// [0, num_categories - 1] using a fixed seed (0). The leading entries are then
// overwritten with 0, 1, 2, ... so that each category appears at least once.
// When num_categories exceeds n only the first n categories can be placed;
// the overwrite is bounded by n so it never writes past the end of the column.
//
// num_categories must be at least 1 (the distribution is constructed with
// the range [0, num_categories - 1]).
inline std::vector<float> GenerateRandomCategoricalSingleColumn(int n,
                                                                int num_categories) {
  std::vector<float> x(n);
  std::mt19937 rng(0);
  std::uniform_int_distribution<int> dist(0, num_categories - 1);
  std::generate(x.begin(), x.end(),
                [&]() { return static_cast<float>(dist(rng)); });
  // Make sure each category is present (as far as the column length allows).
  for (int i = 0; i < num_categories && i < n; ++i) {
    x[i] = static_cast<float>(i);
  }
  return x;
}
// Wraps the buffer x (num_rows * num_columns floats) in a DenseAdapter and
// constructs a SimpleDMatrix from it.
// NOTE(review): the NaN argument is presumably the "missing value" marker and
// the trailing 1 presumably a thread count -- confirm against
// simple_dmatrix.h.
inline data::SimpleDMatrix GetDMatrixFromData(const std::vector<float>& x, int num_rows, int num_columns) {
  const float nan_value = std::numeric_limits<float>::quiet_NaN();
  data::DenseAdapter dense_view(x.data(), num_rows, num_columns);
  return data::SimpleDMatrix(&dense_view, nan_value, 1);
}
// Writes the dense, row-major matrix x to "<tempdir>/temp.libsvm" in libsvm
// text form and loads it back through DMatrix::Load with a "#<file>.cache"
// suffix and the requested page_size.
//
// Each output line has the shape
//   1 0:<v0> 1:<v1> ... <num_columns-1>:<v>
// i.e. a constant label of 1 followed by space-separated index:value pairs
// printed with 15 significant digits. The label is written once per row and
// every pair is preceded by a space; emitting the label inside the feature
// loop would glue a stray "1" onto the digits of the preceding value.
//
// x            row-major values, at least num_rows * num_columns entries.
// num_rows     number of rows to write.
// num_columns  number of features per row.
// page_size    forwarded unchanged to DMatrix::Load.
// tempdir      directory that receives the libsvm file; it must outlive the
//              returned DMatrix's use of those files (NOTE(review): assumed
//              from the file-backed load -- confirm).
inline std::shared_ptr<DMatrix> GetExternalMemoryDMatrixFromData(
    const std::vector<float>& x, int num_rows, int num_columns,
    size_t page_size, const dmlc::TemporaryDirectory& tempdir) {
  // Create the svm file in a temp dir
  const std::string tmp_file = tempdir.path + "/temp.libsvm";
  std::ofstream fo(tmp_file.c_str());
  for (int i = 0; i < num_rows; i++) {
    std::stringstream row_data;
    row_data << std::setprecision(15);
    // Label first, exactly once per row.
    row_data << 1;
    const size_t row_offset = static_cast<size_t>(i) * num_columns;
    for (int j = 0; j < num_columns; j++) {
      row_data << " " << j << ":" << x[row_offset + j];
    }
    fo << row_data.str() << "\n";
  }
  // Flush and close before the loader opens the file.
  fo.close();
  return std::shared_ptr<DMatrix>(DMatrix::Load(
      tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size));
}
// Test that elements are approximately equally distributed among bins
// Checks that the values of one column are spread roughly evenly over the
// bins that actually receive values.
//
// Every value is mapped to a bin with cuts.SearchBin and the hits per bin are
// counted. The expected count is column.size() divided by the number of cuts
// of this column; each populated bin may deviate from it by 30% of the
// expected count (but always by at least 2). Bins that receive no value are
// not examined. num_bins is accepted for interface symmetry and not used.
inline void TestBinDistribution(const HistogramCuts& cuts, int column_idx,
                                const std::vector<float>& column,
                                int num_bins) {
  std::map<int, int> hits_per_bin;
  for (const float value : column) {
    ++hits_per_bin[cuts.SearchBin(value, column_idx)];
  }

  const auto& ptrs = cuts.Ptrs();
  int local_num_bins = ptrs[column_idx + 1] - ptrs[column_idx];
  int expected_num_elements = column.size() / local_num_bins;
  // Deliberately loose tolerance: this only guards against grossly uneven
  // bins, not against small imbalances.
  int allowable_error = std::max(2, int(expected_num_elements * 0.3));

  for (const auto& bin_and_hits : hits_per_bin) {
    EXPECT_LE(std::abs(bin_and_hits.second - expected_num_elements),
              allowable_error);
  }
}
// Test sketch quantiles against the real quantiles
// Not a very strict test
// Compares the rank of every cut point (except the last) with the rank an
// exact quantile would have.
//
// For cut i the expected rank is (i + 1) * sorted_x.size() / cuts.size(); the
// actual rank is the number of elements of sorted_x that are strictly smaller
// than the cut. The two may differ by 5% of the sample count (at least 2).
// This is intentionally not a strict test.
//
// cuts      cut points of one column, in ascending order.
// sorted_x  the column's values, sorted ascending.
//
// An empty cuts vector is accepted and checks nothing, and the scan over
// sorted_x stops at its end, so a cut larger than every sample yields rank
// sorted_x.size() instead of reading past the buffer.
inline void TestRank(const std::vector<float>& cuts,
                     const std::vector<float>& sorted_x) {
  if (cuts.empty()) {
    // cuts.size() - 1 below would wrap around for an empty vector.
    return;
  }
  float eps = 0.05;
  const int acceptable_error = std::max(2, int(sorted_x.size() * eps));
  // Ignore the last cut, it's special
  size_t j = 0;
  for (size_t i = 0; i + 1 < cuts.size(); i++) {
    int expected_rank = ((i + 1) * sorted_x.size()) / cuts.size();
    // j only ever moves forward because both sequences are ascending.
    while (j < sorted_x.size() && cuts[i] > sorted_x[j]) {
      j++;
    }
    int actual_rank = j;
    ASSERT_LE(std::abs(expected_rank - actual_rank), acceptable_error);
  }
}
// Runs all per-column checks on the cuts of column column_idx:
//   * the recorded minimum is below the data, the first cut is above the
//     smallest value and the last cut is not below the largest value;
//   * the cuts are sorted and pairwise distinct;
//   * when the column has no more values than num_bins, the values are
//     distinct and the i-th smallest value falls into the i-th bin;
//   * the bin occupancy (TestBinDistribution) and cut ranks (TestRank) are
//     plausible.
// A failing ASSERT_EQ in the per-value check returns immediately and skips
// the remaining checks.
inline void ValidateColumn(const HistogramCuts& cuts, int column_idx,
                           const std::vector<float>& column,
                           int num_bins) {
  std::vector<float> sorted_column(column);
  std::sort(sorted_column.begin(), sorted_column.end());

  const auto& ptrs = cuts.Ptrs();
  const auto& values = cuts.Values();
  // Half-open range [first_cut, last_cut) of this column inside values.
  const auto first_cut = ptrs[column_idx];
  const auto last_cut = ptrs[column_idx + 1];

  // Check the endpoints are correct
  EXPECT_LT(cuts.MinValues()[column_idx], sorted_column.front());
  EXPECT_GT(values[first_cut], sorted_column.front());
  EXPECT_GE(values[last_cut - 1], sorted_column.back());

  // Check the cuts are sorted
  auto cuts_begin = values.begin() + first_cut;
  auto cuts_end = values.begin() + last_cut;
  EXPECT_TRUE(std::is_sorted(cuts_begin, cuts_end));

  // Check all cut points are unique
  EXPECT_EQ(std::set<float>(cuts_begin, cuts_end).size(),
            cuts_end - cuts_begin);

  if (sorted_column.size() <= num_bins) {
    // No more values than bins: each value should get its own bin.
    // First check the inputs are unique
    int num_unique =
        std::set<float>(sorted_column.begin(), sorted_column.end()).size();
    EXPECT_EQ(num_unique, sorted_column.size());
    size_t position = 0;
    for (const float value : sorted_column) {
      ASSERT_EQ(cuts.SearchBin(value, column_idx), first_cut + position);
      ++position;
    }
  }

  // Private copy of this column's cuts for the rank test.
  std::vector<float> column_cuts(cuts_begin, cuts_end);
  TestBinDistribution(cuts, column_idx, sorted_column, num_bins);
  TestRank(column_cuts, sorted_column);
}
// x is dense and row major
// Validates the cuts of every column of x, where x is a dense, row-major
// num_rows x num_columns matrix. Each column is copied out of x and handed to
// ValidateColumn together with its index.
inline void ValidateCuts(const HistogramCuts& cuts, std::vector<float>& x,
                         int num_rows, int num_columns,
                         int num_bins) {
  for (int col = 0; col < num_columns; ++col) {
    // Gather column `col` by striding through the row-major buffer.
    std::vector<float> column;
    column.reserve(num_rows);
    for (int row = 0; row < num_rows; ++row) {
      column.push_back(x[static_cast<size_t>(row) * num_columns + col]);
    }
    ValidateColumn(cuts, col, column, num_bins);
  }
}
} // namespace common
} // namespace xgboost

View File

@@ -228,17 +228,23 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
std::stringstream row_data;
size_t j = 0;
if (rem_cols > 0) {
for (; j < std::min(static_cast<size_t>(rem_cols), cols_per_row); ++j) {
row_data << label(*gen) << " " << (col_idx+j) << ":" << (col_idx+j+1)*10*i;
}
rem_cols -= cols_per_row;
for (; j < std::min(static_cast<size_t>(rem_cols), cols_per_row); ++j) {
row_data << label(*gen) << " " << (col_idx + j) << ":"
<< (col_idx + j + 1) * 10 * i;
}
rem_cols -= cols_per_row;
} else {
// Take some random number of columns in [1, n_cols] and slot them here
size_t ncols = dis(*gen);
for (; j < ncols; ++j) {
size_t fid = (col_idx+j) % n_cols;
row_data << label(*gen) << " " << fid << ":" << (fid+1)*10*i;
}
// Take some random number of columns in [1, n_cols] and slot them here
std::vector<size_t> random_columns;
size_t ncols = dis(*gen);
for (; j < ncols; ++j) {
size_t fid = (col_idx + j) % n_cols;
random_columns.push_back(fid);
}
std::sort(random_columns.begin(), random_columns.end());
for (auto fid : random_columns) {
row_data << label(*gen) << " " << fid << ":" << (fid + 1) * 10 * i;
}
}
col_idx += j;

View File

@@ -342,20 +342,17 @@ TEST(GpuHist, MinSplitLoss) {
delete dmat;
}
void UpdateTree(HostDeviceVector<GradientPair>* gpair,
DMatrix* dmat,
size_t gpu_page_size,
RegTree* tree,
HostDeviceVector<bst_float>* preds,
float subsample = 1.0f,
const std::string& sampling_method = "uniform") {
constexpr size_t kMaxBin = 2;
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
size_t gpu_page_size, RegTree* tree,
HostDeviceVector<bst_float>* preds, float subsample = 1.0f,
const std::string& sampling_method = "uniform",
int max_bin = 2) {
if (gpu_page_size > 0) {
// Loop over the batches and count the records
int64_t batch_count = 0;
int64_t row_count = 0;
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, kMaxBin, 0, gpu_page_size})) {
for (const auto& batch : dmat->GetBatches<EllpackPage>({0, max_bin, 0, gpu_page_size})) {
EXPECT_LT(batch.Size(), dmat->Info().num_row_);
batch_count++;
row_count += batch.Size();
@@ -366,7 +363,7 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair,
Args args{
{"max_depth", "2"},
{"max_bin", std::to_string(kMaxBin)},
{"max_bin", std::to_string(max_bin)},
{"min_child_weight", "0.0"},
{"reg_alpha", "0"},
{"reg_lambda", "0"},
@@ -386,7 +383,7 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair,
TEST(GpuHist, UniformSampling) {
constexpr size_t kRows = 4096;
constexpr size_t kCols = 2;
constexpr float kSubsample = 0.99;
constexpr float kSubsample = 0.9999;
common::GlobalRandom().seed(1994);
// Create an in-memory DMatrix.
@@ -397,25 +394,25 @@ TEST(GpuHist, UniformSampling) {
// Build a tree using the in-memory DMatrix.
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
UpdateTree(&gpair, dmat.get(), 0, &tree, &preds);
UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
// Build another tree using sampling.
RegTree tree_sampling;
HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, 0);
UpdateTree(&gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample);
UpdateTree(&gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
"uniform", kRows);
// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();
auto preds_sampling_h = preds_sampling.ConstHostVector();
for (int i = 0; i < kRows; i++) {
EXPECT_NEAR(preds_h[i], preds_sampling_h[i], 2e-3);
EXPECT_NEAR(preds_h[i], preds_sampling_h[i], 1e-8);
}
}
TEST(GpuHist, GradientBasedSampling) {
constexpr size_t kRows = 4096;
constexpr size_t kCols = 2;
constexpr float kSubsample = 0.99;
constexpr float kSubsample = 0.9999;
common::GlobalRandom().seed(1994);
// Create an in-memory DMatrix.
@@ -426,12 +423,13 @@ TEST(GpuHist, GradientBasedSampling) {
// Build a tree using the in-memory DMatrix.
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
UpdateTree(&gpair, dmat.get(), 0, &tree, &preds);
UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
// Build another tree using sampling.
RegTree tree_sampling;
HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, 0);
UpdateTree(&gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "gradient_based");
UpdateTree(&gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
"gradient_based", kRows);
// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();
@@ -459,18 +457,17 @@ TEST(GpuHist, ExternalMemory) {
// Build a tree using the in-memory DMatrix.
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
UpdateTree(&gpair, dmat.get(), 0, &tree, &preds);
UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
// Build another tree using multiple ELLPACK pages.
RegTree tree_ext;
HostDeviceVector<bst_float> preds_ext(kRows, 0.0, 0);
UpdateTree(&gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext);
UpdateTree(&gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, 1.0, "uniform", kRows);
// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();
auto preds_ext_h = preds_ext.ConstHostVector();
for (int i = 0; i < kRows; i++) {
EXPECT_NEAR(preds_h[i], preds_ext_h[i], 2e-6);
EXPECT_NEAR(preds_h[i], preds_ext_h[i], 1e-6);
}
}
@@ -495,12 +492,14 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
// Build a tree using the in-memory DMatrix.
RegTree tree;
HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod);
UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod,
kRows);
// Build another tree using multiple ELLPACK pages.
RegTree tree_ext;
HostDeviceVector<bst_float> preds_ext(kRows, 0.0, 0);
UpdateTree(&gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, kSubsample, kSamplingMethod);
UpdateTree(&gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext,
kSubsample, kSamplingMethod, kRows);
// Make sure the predictions are the same.
auto preds_h = preds.ConstHostVector();