Implement GK sketching on GPU. (#5846)

* Implement GK sketching on GPU.
* Strong tests on quantile building.
* Handle sparse dataset by binary searching the column index.
* Hypothesis test on dask.
This commit is contained in:
Jiaming Yuan
2020-07-07 12:16:21 +08:00
committed by GitHub
parent ac3f0e78dc
commit 048d969be4
25 changed files with 2045 additions and 405 deletions

View File

@@ -5,6 +5,7 @@
#include <thrust/device_vector.h>
#include <xgboost/base.h>
#include "../../../src/common/device_helpers.cuh"
#include "../../../src/common/quantile.h"
#include "../helpers.h"
#include "gtest/gtest.h"
@@ -14,3 +15,128 @@ TEST(SumReduce, Test) {
ASSERT_NEAR(sum, 100.0f, 1e-5);
}
void TestAtomicSizeT() {
size_t constexpr kThreads = 235;
dh::device_vector<size_t> out(1, 0);
auto d_out = dh::ToSpan(out);
dh::LaunchN(0, kThreads, [=]__device__(size_t idx){
atomicAdd(&d_out[0], static_cast<size_t>(1));
});
ASSERT_EQ(out[0], kThreads);
}
TEST(AtomicAdd, SizeT) {
TestAtomicSizeT();
}
TEST(SegmentedUnique, Basic) {
std::vector<float> values{0.1f, 0.2f, 0.3f, 0.62448811531066895f, 0.62448811531066895f, 0.4f};
std::vector<size_t> segments{0, 3, 6};
thrust::device_vector<float> d_values(values);
thrust::device_vector<xgboost::bst_feature_t> d_segments{segments};
thrust::device_vector<xgboost::bst_feature_t> d_segs_out(d_segments.size());
thrust::device_vector<float> d_vals_out(d_values.size());
size_t n_uniques = dh::SegmentedUnique(
d_segments.data().get(), d_segments.data().get() + d_segments.size(),
d_values.data().get(), d_values.data().get() + d_values.size(),
d_segs_out.data().get(), d_vals_out.data().get(),
thrust::equal_to<float>{});
CHECK_EQ(n_uniques, 5);
std::vector<float> values_sol{0.1f, 0.2f, 0.3f, 0.62448811531066895f, 0.4f};
for (auto i = 0 ; i < values_sol.size(); i ++) {
ASSERT_EQ(d_vals_out[i], values_sol[i]);
}
std::vector<xgboost::bst_feature_t> segments_sol{0, 3, 5};
for (size_t i = 0; i < d_segments.size(); ++i) {
ASSERT_EQ(segments_sol[i], d_segs_out[i]);
}
d_segments[1] = 4;
d_segments[2] = 6;
n_uniques = dh::SegmentedUnique(
d_segments.data().get(), d_segments.data().get() + d_segments.size(),
d_values.data().get(), d_values.data().get() + d_values.size(),
d_segs_out.data().get(), d_vals_out.data().get(),
thrust::equal_to<float>{});
ASSERT_EQ(n_uniques, values.size());
for (auto i = 0 ; i < values.size(); i ++) {
ASSERT_EQ(d_vals_out[i], values[i]);
}
}
namespace {
using SketchEntry = xgboost::common::WQSummary<float, float>::Entry;
struct SketchUnique {
bool __device__ operator()(SketchEntry const& a, SketchEntry const& b) const {
return a.value - b.value == 0;
}
};
struct IsSorted {
bool __device__ operator()(SketchEntry const& a, SketchEntry const& b) const {
return a.value < b.value;
}
};
} // namespace
namespace xgboost {
namespace common {
void TestSegmentedUniqueRegression(std::vector<SketchEntry> values, size_t n_duplicated) {
std::vector<bst_feature_t> segments{0, static_cast<bst_feature_t>(values.size())};
thrust::device_vector<SketchEntry> d_values(values);
thrust::device_vector<bst_feature_t> d_segments(segments);
thrust::device_vector<bst_feature_t> d_segments_out(segments.size());
size_t n_uniques = dh::SegmentedUnique(
d_segments.data().get(), d_segments.data().get() + d_segments.size(), d_values.data().get(),
d_values.data().get() + d_values.size(), d_segments_out.data().get(), d_values.data().get(),
SketchUnique{});
ASSERT_EQ(n_uniques, values.size() - n_duplicated);
ASSERT_TRUE(thrust::is_sorted(thrust::device, d_values.begin(),
d_values.begin() + n_uniques, IsSorted{}));
ASSERT_EQ(segments.at(0), d_segments_out[0]);
ASSERT_EQ(segments.at(1), d_segments_out[1] + n_duplicated);
}
TEST(SegmentedUnique, Regression) {
{
std::vector<SketchEntry> values{{3149, 3150, 1, 0.62392902374267578},
{3151, 3152, 1, 0.62418866157531738},
{3152, 3153, 1, 0.62419462203979492},
{3153, 3154, 1, 0.62431186437606812},
{3154, 3155, 1, 0.6244881153106689453125},
{3155, 3156, 1, 0.6244881153106689453125},
{3155, 3156, 1, 0.6244881153106689453125},
{3155, 3156, 1, 0.6244881153106689453125},
{3157, 3158, 1, 0.62552797794342041},
{3158, 3159, 1, 0.6256556510925293},
{3159, 3160, 1, 0.62571090459823608},
{3160, 3161, 1, 0.62577134370803833}};
TestSegmentedUniqueRegression(values, 3);
}
{
std::vector<SketchEntry> values{{3149, 3150, 1, 0.62392902374267578},
{3151, 3152, 1, 0.62418866157531738},
{3152, 3153, 1, 0.62419462203979492},
{3153, 3154, 1, 0.62431186437606812},
{3154, 3155, 1, 0.6244881153106689453125},
{3157, 3158, 1, 0.62552797794342041},
{3158, 3159, 1, 0.6256556510925293},
{3159, 3160, 1, 0.62571090459823608},
{3160, 3161, 1, 0.62577134370803833}};
TestSegmentedUniqueRegression(values, 0);
}
{
std::vector<SketchEntry> values;
TestSegmentedUniqueRegression(values, 0);
}
}
} // namespace common
} // namespace xgboost

View File

@@ -30,11 +30,12 @@ HistogramCuts GetHostCuts(AdapterT *adapter, int num_bins, float missing) {
builder.Build(&dmat, num_bins);
return cuts;
}
TEST(HistUtil, DeviceSketch) {
int num_rows = 5;
int num_columns = 1;
int num_bins = 4;
std::vector<float> x = {1.0, 2.0, 3.0, 4.0, 5.0};
std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, -1.0f};
int num_rows = x.size();
auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
auto device_cuts = DeviceSketch(0, dmat.get(), num_bins);
@@ -47,26 +48,6 @@ TEST(HistUtil, DeviceSketch) {
EXPECT_EQ(device_cuts.MinValues(), host_cuts.MinValues());
}
// Duplicate this function from hist_util.cu so we don't have to expose it in
// header
size_t RequiredSampleCutsTest(int max_bins, size_t num_rows) {
double eps = 1.0 / (SketchContainer::kFactor * max_bins);
size_t dummy_nlevel;
size_t num_cuts;
WQuantileSketch<bst_float, bst_float>::LimitSizeLevel(
num_rows, eps, &dummy_nlevel, &num_cuts);
return std::min(num_cuts, num_rows);
}
size_t BytesRequiredForTest(size_t num_rows, size_t num_columns, size_t num_bins,
bool with_weights) {
size_t bytes_num_elements = BytesPerElement(with_weights) * num_rows * num_columns;
size_t bytes_cuts = RequiredSampleCutsTest(num_bins, num_rows) * num_columns *
sizeof(DenseCuts::WQSketch::Entry);
// divide by 2 is because the memory quota used in sorting is reused for storing cuts.
return bytes_num_elements / 2 + bytes_cuts;
}
TEST(HistUtil, DeviceSketchMemory) {
int num_columns = 100;
int num_rows = 1000;
@@ -77,15 +58,15 @@ TEST(HistUtil, DeviceSketchMemory) {
dh::GlobalMemoryLogger().Clear();
ConsoleLogger::Configure({{"verbosity", "3"}});
auto device_cuts = DeviceSketch(0, dmat.get(), num_bins);
ConsoleLogger::Configure({{"verbosity", "0"}});
size_t bytes_required = BytesRequiredForTest(num_rows, num_columns, num_bins, false);
size_t bytes_constant = 1000;
EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required + bytes_constant);
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
size_t bytes_required = detail::RequiredMemory(
num_rows, num_columns, num_rows * num_columns, num_bins, false);
EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 1.05);
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 0.95);
ConsoleLogger::Configure({{"verbosity", "0"}});
}
TEST(HistUtil, DeviceSketchMemoryWeights) {
TEST(HistUtil, DeviceSketchWeightsMemory) {
int num_columns = 100;
int num_rows = 1000;
int num_bins = 256;
@@ -98,7 +79,8 @@ TEST(HistUtil, DeviceSketchMemoryWeights) {
auto device_cuts = DeviceSketch(0, dmat.get(), num_bins);
ConsoleLogger::Configure({{"verbosity", "0"}});
size_t bytes_required = BytesRequiredForTest(num_rows, num_columns, num_bins, true);
size_t bytes_required = detail::RequiredMemory(
num_rows, num_columns, num_rows * num_columns, num_bins, true);
EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 1.05);
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
}
@@ -118,7 +100,7 @@ TEST(HistUtil, DeviceSketchDeterminism) {
}
}
TEST(HistUtil, DeviceSketchCategorical) {
TEST(HistUtil, DeviceSketchCategorical) {
int categorical_sizes[] = {2, 6, 8, 12};
int num_bins = 256;
int sizes[] = {25, 100, 1000};
@@ -231,11 +213,10 @@ template <typename Adapter>
void ValidateBatchedCuts(Adapter adapter, int num_bins, int num_columns, int num_rows,
DMatrix* dmat) {
common::HistogramCuts batched_cuts;
SketchContainer sketch_container(num_bins, num_columns, num_rows);
SketchContainer sketch_container(num_bins, num_columns, num_rows, 0);
AdapterDeviceSketch(adapter.Value(), num_bins, std::numeric_limits<float>::quiet_NaN(),
0, &sketch_container);
common::DenseCuts dense_cuts(&batched_cuts);
dense_cuts.Init(&sketch_container.sketches_, num_bins, num_rows);
&sketch_container);
sketch_container.MakeCuts(&batched_cuts);
ValidateCuts(batched_cuts, dmat, num_bins);
}
@@ -275,12 +256,13 @@ TEST(HistUtil, AdapterDeviceSketchMemory) {
std::numeric_limits<float>::quiet_NaN());
ConsoleLogger::Configure({{"verbosity", "0"}});
size_t bytes_constant = 1000;
size_t bytes_required = BytesRequiredForTest(num_rows, num_columns, num_bins, false);
size_t bytes_required = detail::RequiredMemory(
num_rows, num_columns, num_rows * num_columns, num_bins, false);
EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required + bytes_constant);
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 0.95);
}
TEST(HistUtil, AdapterSketchBatchMemory) {
TEST(HistUtil, AdapterSketchSlidingWindowMemory) {
int num_columns = 100;
int num_rows = 1000;
int num_bins = 256;
@@ -291,17 +273,19 @@ TEST(HistUtil, AdapterSketchBatchMemory) {
dh::GlobalMemoryLogger().Clear();
ConsoleLogger::Configure({{"verbosity", "3"}});
common::HistogramCuts batched_cuts;
SketchContainer sketch_container(num_bins, num_columns, num_rows);
SketchContainer sketch_container(num_bins, num_columns, num_rows, 0);
AdapterDeviceSketch(adapter.Value(), num_bins, std::numeric_limits<float>::quiet_NaN(),
0, &sketch_container);
&sketch_container);
HistogramCuts cuts;
sketch_container.MakeCuts(&cuts);
size_t bytes_required = detail::RequiredMemory(
num_rows, num_columns, num_rows * num_columns, num_bins, false);
EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 1.05);
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 0.95);
ConsoleLogger::Configure({{"verbosity", "0"}});
size_t bytes_constant = 1000;
size_t bytes_required = BytesRequiredForTest(num_rows, num_columns, num_bins, false);
EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required + bytes_constant);
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
}
TEST(HistUtil, AdapterSketchBatchWeightedMemory) {
TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
int num_columns = 100;
int num_rows = 1000;
int num_bins = 256;
@@ -316,12 +300,15 @@ TEST(HistUtil, AdapterSketchBatchWeightedMemory) {
dh::GlobalMemoryLogger().Clear();
ConsoleLogger::Configure({{"verbosity", "3"}});
common::HistogramCuts batched_cuts;
SketchContainer sketch_container(num_bins, num_columns, num_rows);
SketchContainer sketch_container(num_bins, num_columns, num_rows, 0);
AdapterDeviceSketchWeighted(adapter.Value(), num_bins, info,
std::numeric_limits<float>::quiet_NaN(), 0,
std::numeric_limits<float>::quiet_NaN(),
&sketch_container);
HistogramCuts cuts;
sketch_container.MakeCuts(&cuts);
ConsoleLogger::Configure({{"verbosity", "0"}});
size_t bytes_required = BytesRequiredForTest(num_rows, num_columns, num_bins, true);
size_t bytes_required = detail::RequiredMemory(
num_rows, num_columns, num_rows * num_columns, num_bins, true);
EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 1.05);
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
}
@@ -462,13 +449,11 @@ void TestAdapterSketchFromWeights(bool with_group) {
data::CupyAdapter adapter(m);
auto const& batch = adapter.Value();
SketchContainer sketch_container(kBins, kCols, kRows);
SketchContainer sketch_container(kBins, kCols, kRows, 0);
AdapterDeviceSketchWeighted(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
0,
&sketch_container);
common::HistogramCuts cuts;
common::DenseCuts dense_cuts(&cuts);
dense_cuts.Init(&sketch_container.sketches_, kBins, kRows);
sketch_container.MakeCuts(&cuts);
auto dmat = GetDMatrixFromData(storage.HostVector(), kRows, kCols);
if (with_group) {

View File

@@ -117,7 +117,7 @@ inline void TestBinDistribution(const HistogramCuts &cuts, int column_idx,
// First and last bin can have smaller
for (auto& kv : bin_weights) {
EXPECT_LE(std::abs(bin_weights[kv.first] - expected_bin_weight),
ASSERT_LE(std::abs(bin_weights[kv.first] - expected_bin_weight),
allowable_error);
}
}
@@ -189,7 +189,7 @@ inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat,
// Collect data into columns
std::vector<std::vector<float>> columns(dmat->Info().num_col_);
for (auto& batch : dmat->GetBatches<SparsePage>()) {
CHECK_GT(batch.Size(), 0);
ASSERT_GT(batch.Size(), 0);
for (auto i = 0ull; i < batch.Size(); i++) {
for (auto e : batch[i]) {
columns[e.index].push_back(e.fvalue);

0
tests/cpp/common/test_partition_builder.cc Executable file → Normal file
View File

View File

@@ -0,0 +1,480 @@
#include <gtest/gtest.h>
#include "../helpers.h"
#include "../../../src/common/hist_util.cuh"
#include "../../../src/common/quantile.cuh"
namespace xgboost {
namespace common {
TEST(GPUQuantile, Basic) {
constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
SketchContainer sketch(kBins, kCols, kRows, 0);
dh::caching_device_vector<SketchEntry> entries;
dh::device_vector<bst_row_t> cuts_ptr(kCols+1);
thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0);
// Push empty
sketch.Push(dh::ToSpan(cuts_ptr), &entries);
ASSERT_EQ(sketch.Data().size(), 0);
}
template <typename Fn> void RunWithSeedsAndBins(size_t rows, Fn fn) {
std::vector<int32_t> seeds(4);
SimpleLCG lcg;
SimpleRealUniformDistribution<float> dist(3, 1000);
std::generate(seeds.begin(), seeds.end(), [&](){ return dist(&lcg); });
std::vector<size_t> bins(8);
for (size_t i = 0; i < bins.size() - 1; ++i) {
bins[i] = i * 35 + 2;
}
bins.back() = rows + 80; // provide a bin number greater than rows.
std::vector<MetaInfo> infos(2);
auto& h_weights = infos.front().weights_.HostVector();
h_weights.resize(rows);
std::generate(h_weights.begin(), h_weights.end(), [&]() { return dist(&lcg); });
for (auto seed : seeds) {
for (auto n_bin : bins) {
for (auto const& info : infos) {
fn(seed, n_bin, info);
}
}
}
}
void TestSketchUnique(float sparsity) {
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) {
SketchContainer sketch(n_bins, kCols, kRows, 0);
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity}
.Seed(seed)
.Device(0)
.GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str);
AdapterDeviceSketchWeighted(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), &sketch);
auto n_cuts = detail::RequiredSampleCutsPerColumn(n_bins, kRows);
dh::caching_device_vector<size_t> column_sizes_scan;
HostDeviceVector<size_t> cut_sizes_scan;
auto batch = adapter.Value();
data::IsValidFunctor is_valid(std::numeric_limits<float>::quiet_NaN());
auto batch_iter = dh::MakeTransformIterator<data::COOTuple>(
thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) { return batch.GetElement(idx); });
auto end = kCols * kRows;
detail::GetColumnSizesScan(0, kCols, n_cuts, batch_iter, is_valid, 0, end,
&cut_sizes_scan, &column_sizes_scan);
auto const& cut_sizes = cut_sizes_scan.HostVector();
if (sparsity == 0) {
ASSERT_EQ(sketch.Data().size(), n_cuts * kCols);
} else {
ASSERT_EQ(sketch.Data().size(), cut_sizes.back());
}
sketch.Unique();
ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(),
sketch.Data().data() + sketch.Data().size(),
detail::SketchUnique{}));
});
}
TEST(GPUQuantile, Unique) {
TestSketchUnique(0);
TestSketchUnique(0.5);
}
// if with_error is true, the test tolerates floating point error
void TestQuantileElemRank(int32_t device, Span<SketchEntry const> in,
Span<bst_row_t const> d_columns_ptr, bool with_error = false) {
dh::LaunchN(device, in.size(), [=]XGBOOST_DEVICE(size_t idx) {
auto column_id = dh::SegmentId(d_columns_ptr, idx);
auto in_column = in.subspan(d_columns_ptr[column_id],
d_columns_ptr[column_id + 1] -
d_columns_ptr[column_id]);
auto constexpr kEps = 1e-6f;
idx -= d_columns_ptr[column_id];
float prev_rmin = idx == 0 ? 0.0f : in_column[idx-1].rmin;
float prev_rmax = idx == 0 ? 0.0f : in_column[idx-1].rmax;
float rmin_next = in_column[idx].RMinNext();
if (with_error) {
SPAN_CHECK(in_column[idx].rmin + in_column[idx].rmin * kEps >= prev_rmin);
SPAN_CHECK(in_column[idx].rmax + in_column[idx].rmin * kEps >= prev_rmax);
SPAN_CHECK(in_column[idx].rmax + in_column[idx].rmin * kEps >= rmin_next);
} else {
SPAN_CHECK(in_column[idx].rmin >= prev_rmin);
SPAN_CHECK(in_column[idx].rmax >= prev_rmax);
SPAN_CHECK(in_column[idx].rmax >= rmin_next);
}
});
// Force sync to terminate current test instead of a later one.
dh::DebugSyncDevice(__FILE__, __LINE__);
}
TEST(GPUQuantile, Prune) {
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
SketchContainer sketch(n_bins, kCols, kRows, 0);
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Seed(seed)
.GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str);
AdapterDeviceSketchWeighted(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), &sketch);
auto n_cuts = detail::RequiredSampleCutsPerColumn(n_bins, kRows);
ASSERT_EQ(sketch.Data().size(), n_cuts * kCols);
sketch.Prune(n_bins);
if (n_bins <= kRows) {
ASSERT_EQ(sketch.Data().size(), n_bins * kCols);
} else {
// LE because kRows * kCols is pushed into sketch, after removing duplicated entries
// we might not have that much inputs for prune.
ASSERT_LE(sketch.Data().size(), kRows * kCols);
}
// This is not necessarily true for all inputs without calling unique after
// prune.
ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(),
sketch.Data().data() + sketch.Data().size(),
detail::SketchUnique{}));
TestQuantileElemRank(0, sketch.Data(), sketch.ColumnsPtr());
});
}
TEST(GPUQuantile, MergeEmpty) {
constexpr size_t kRows = 1000, kCols = 100;
size_t n_bins = 10;
SketchContainer sketch_0(n_bins, kCols, kRows, 0);
HostDeviceVector<float> storage_0;
std::string interface_str_0 =
RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface(
&storage_0);
data::CupyAdapter adapter_0(interface_str_0);
AdapterDeviceSketch(adapter_0.Value(), n_bins,
std::numeric_limits<float>::quiet_NaN(), &sketch_0);
std::vector<SketchEntry> entries_before(sketch_0.Data().size());
dh::CopyDeviceSpanToVector(&entries_before, sketch_0.Data());
std::vector<bst_row_t> ptrs_before(sketch_0.ColumnsPtr().size());
dh::CopyDeviceSpanToVector(&ptrs_before, sketch_0.ColumnsPtr());
thrust::device_vector<size_t> columns_ptr(kCols + 1);
// Merge an empty sketch
sketch_0.Merge(dh::ToSpan(columns_ptr), Span<SketchEntry>{});
std::vector<SketchEntry> entries_after(sketch_0.Data().size());
dh::CopyDeviceSpanToVector(&entries_after, sketch_0.Data());
std::vector<bst_row_t> ptrs_after(sketch_0.ColumnsPtr().size());
dh::CopyDeviceSpanToVector(&ptrs_after, sketch_0.ColumnsPtr());
CHECK_EQ(entries_before.size(), entries_after.size());
CHECK_EQ(ptrs_before.size(), ptrs_after.size());
for (size_t i = 0; i < entries_before.size(); ++i) {
CHECK_EQ(entries_before[i].value, entries_after[i].value);
CHECK_EQ(entries_before[i].rmin, entries_after[i].rmin);
CHECK_EQ(entries_before[i].rmax, entries_after[i].rmax);
CHECK_EQ(entries_before[i].wmin, entries_after[i].wmin);
}
for (size_t i = 0; i < ptrs_before.size(); ++i) {
CHECK_EQ(ptrs_before[i], ptrs_after[i]);
}
}
TEST(GPUQuantile, MergeBasic) {
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
SketchContainer sketch_0(n_bins, kCols, kRows, 0);
HostDeviceVector<float> storage_0;
std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Seed(seed)
.GenerateArrayInterface(&storage_0);
data::CupyAdapter adapter_0(interface_str_0);
AdapterDeviceSketchWeighted(adapter_0.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), &sketch_0);
SketchContainer sketch_1(n_bins, kCols, kRows * kRows, 0);
HostDeviceVector<float> storage_1;
std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Seed(seed)
.GenerateArrayInterface(&storage_1);
data::CupyAdapter adapter_1(interface_str_1);
AdapterDeviceSketchWeighted(adapter_1.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), &sketch_1);
size_t size_before_merge = sketch_0.Data().size();
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
if (info.weights_.Size() != 0) {
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), true);
sketch_0.FixError();
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), false);
} else {
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr());
}
auto columns_ptr = sketch_0.ColumnsPtr();
std::vector<bst_row_t> h_columns_ptr(columns_ptr.size());
dh::CopyDeviceSpanToVector(&h_columns_ptr, columns_ptr);
ASSERT_EQ(h_columns_ptr.back(), sketch_1.Data().size() + size_before_merge);
sketch_0.Unique();
ASSERT_TRUE(
thrust::is_sorted(thrust::device, sketch_0.Data().data(),
sketch_0.Data().data() + sketch_0.Data().size(),
detail::SketchUnique{}));
});
}
void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
MetaInfo info;
int32_t seed = 0;
SketchContainer sketch_0(n_bins, cols, rows, 0);
HostDeviceVector<float> storage_0;
std::string interface_str_0 = RandomDataGenerator{rows, cols, 0}
.Device(0)
.Seed(seed)
.GenerateArrayInterface(&storage_0);
data::CupyAdapter adapter_0(interface_str_0);
AdapterDeviceSketchWeighted(adapter_0.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(),
&sketch_0);
size_t f_rows = rows * frac;
SketchContainer sketch_1(n_bins, cols, f_rows, 0);
HostDeviceVector<float> storage_1;
std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0}
.Device(0)
.Seed(seed)
.GenerateArrayInterface(&storage_1);
auto data_1 = storage_1.DeviceSpan();
auto tuple_it = thrust::make_tuple(
thrust::make_counting_iterator<size_t>(0ul), data_1.data());
using Tuple = thrust::tuple<size_t, float>;
auto it = thrust::make_zip_iterator(tuple_it);
thrust::transform(thrust::device, it, it + data_1.size(), data_1.data(),
[=] __device__(Tuple const &tuple) {
auto i = thrust::get<0>(tuple);
if (thrust::get<0>(tuple) % 2 == 0) {
return 0.0f;
} else {
return thrust::get<1>(tuple);
}
});
data::CupyAdapter adapter_1(interface_str_1);
AdapterDeviceSketchWeighted(adapter_1.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(),
&sketch_1);
size_t size_before_merge = sketch_0.Data().size();
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr());
auto columns_ptr = sketch_0.ColumnsPtr();
std::vector<bst_row_t> h_columns_ptr(columns_ptr.size());
dh::CopyDeviceSpanToVector(&h_columns_ptr, columns_ptr);
ASSERT_EQ(h_columns_ptr.back(), sketch_1.Data().size() + size_before_merge);
sketch_0.Unique();
ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch_0.Data().data(),
sketch_0.Data().data() + sketch_0.Data().size(),
detail::SketchUnique{}));
}
TEST(GPUQuantile, MergeDuplicated) {
size_t n_bins = 256;
constexpr size_t kRows = 1000, kCols = 100;
for (float frac = 0.5; frac < 2.5; frac += 0.5) {
TestMergeDuplicated(n_bins, kRows, kCols, frac);
}
}
void InitRabitContext(std::string msg) {
auto n_gpus = AllVisibleGPUs();
auto port = std::getenv("DMLC_TRACKER_PORT");
std::string port_str;
if (port) {
port_str = port;
} else {
LOG(WARNING) << msg << " as `DMLC_TRACKER_PORT` is not set up.";
return;
}
std::vector<std::string> envs{
"DMLC_TRACKER_PORT=" + port_str,
"DMLC_TRACKER_URI=127.0.0.1",
"DMLC_NUM_WORKER=" + std::to_string(n_gpus)};
char* c_envs[] {&(envs[0][0]), &(envs[1][0]), &(envs[2][0])};
rabit::Init(3, c_envs);
}
TEST(GPUQuantile, AllReduceBasic) {
// This test is supposed to run by a python test that setups the environment.
std::string msg {"Skipping AllReduce test"};
#if defined(__linux__) && defined(XGBOOST_USE_NCCL)
InitRabitContext(msg);
auto n_gpus = AllVisibleGPUs();
auto world = rabit::GetWorldSize();
if (world != 1) {
ASSERT_EQ(world, n_gpus);
} else {
return;
}
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
// Set up single node version;
SketchContainer sketch_on_single_node(n_bins, kCols, kRows, 0);
size_t intermediate_num_cuts =
std::min(kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
std::vector<SketchContainer> containers;
for (auto rank = 0; rank < world; ++rank) {
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Seed(rank + seed)
.GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str);
containers.emplace_back(n_bins, kCols, kRows, 0);
AdapterDeviceSketchWeighted(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(),
&containers.back());
}
for (auto& sketch : containers) {
sketch.Prune(intermediate_num_cuts);
sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data());
sketch_on_single_node.FixError();
}
sketch_on_single_node.Unique();
TestQuantileElemRank(0, sketch_on_single_node.Data(),
sketch_on_single_node.ColumnsPtr());
// Set up distributed version. We rely on using rank as seed to generate
// the exact same copy of data.
auto rank = rabit::GetRank();
SketchContainer sketch_distributed(n_bins, kCols, kRows, 0);
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Seed(rank + seed)
.GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str);
AdapterDeviceSketchWeighted(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(),
&sketch_distributed);
sketch_distributed.AllReduce();
sketch_distributed.Unique();
ASSERT_EQ(sketch_distributed.ColumnsPtr().size(),
sketch_on_single_node.ColumnsPtr().size());
ASSERT_EQ(sketch_distributed.Data().size(),
sketch_on_single_node.Data().size());
TestQuantileElemRank(0, sketch_distributed.Data(),
sketch_distributed.ColumnsPtr());
std::vector<SketchEntry> single_node_data(
sketch_on_single_node.Data().size());
dh::CopyDeviceSpanToVector(&single_node_data, sketch_on_single_node.Data());
std::vector<SketchEntry> distributed_data(sketch_distributed.Data().size());
dh::CopyDeviceSpanToVector(&distributed_data, sketch_distributed.Data());
float Eps = 2e-4 * world;
for (size_t i = 0; i < single_node_data.size(); ++i) {
ASSERT_NEAR(single_node_data[i].value, distributed_data[i].value, Eps);
ASSERT_NEAR(single_node_data[i].rmax, distributed_data[i].rmax, Eps);
ASSERT_NEAR(single_node_data[i].rmin, distributed_data[i].rmin, Eps);
ASSERT_NEAR(single_node_data[i].wmin, distributed_data[i].wmin, Eps);
}
});
rabit::Finalize();
#else
LOG(WARNING) << msg;
return;
#endif // !defined(__linux__) && defined(XGBOOST_USE_NCCL)
}
TEST(GPUQuantile, SameOnAllWorkers) {
std::string msg {"Skipping SameOnAllWorkers test"};
#if defined(__linux__) && defined(XGBOOST_USE_NCCL)
InitRabitContext(msg);
auto world = rabit::GetWorldSize();
auto n_gpus = AllVisibleGPUs();
if (world != 1) {
ASSERT_EQ(world, n_gpus);
} else {
return;
}
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
MetaInfo const &info) {
auto rank = rabit::GetRank();
SketchContainer sketch_distributed(n_bins, kCols, kRows, 0);
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Seed(rank + seed)
.GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str);
AdapterDeviceSketchWeighted(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(),
&sketch_distributed);
sketch_distributed.AllReduce();
sketch_distributed.Unique();
TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr());
// Test for all workers having the same sketch.
size_t n_data = sketch_distributed.Data().size();
rabit::Allreduce<rabit::op::Max>(&n_data, 1);
ASSERT_EQ(n_data, sketch_distributed.Data().size());
size_t size_as_float =
sketch_distributed.Data().size_bytes() / sizeof(float);
auto local_data = Span<float const>{
reinterpret_cast<float const *>(sketch_distributed.Data().data()),
size_as_float};
dh::caching_device_vector<float> all_workers(size_as_float * world);
thrust::fill(all_workers.begin(), all_workers.end(), 0);
thrust::copy(thrust::device, local_data.data(),
local_data.data() + local_data.size(),
all_workers.begin() + local_data.size() * rank);
dh::AllReducer reducer;
reducer.Init(0);
reducer.AllReduceSum(all_workers.data().get(), all_workers.data().get(),
all_workers.size());
reducer.Synchronize();
auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
std::vector<float> h_base_line(base_line.size());
dh::CopyDeviceSpanToVector(&h_base_line, base_line);
size_t offset = 0;
for (size_t i = 0; i < world; ++i) {
auto comp = dh::ToSpan(all_workers).subspan(offset, size_as_float);
std::vector<float> h_comp(comp.size());
dh::CopyDeviceSpanToVector(&h_comp, comp);
ASSERT_EQ(comp.size(), base_line.size());
for (size_t j = 0; j < h_comp.size(); ++j) {
ASSERT_NEAR(h_base_line[j], h_comp[j], kRtEps);
}
offset += size_as_float;
}
});
#else
LOG(WARNING) << msg;
return;
#endif // !defined(__linux__) && defined(XGBOOST_USE_NCCL)
}
} // namespace common
} // namespace xgboost

View File

@@ -422,11 +422,11 @@ TEST(Span, Subspan) {
ASSERT_EQ(s4.size(), s1.size() - 2);
EXPECT_DEATH(s1.subspan(-1, 0), "\\[xgboost\\] Condition .* failed.\n");
EXPECT_DEATH(s1.subspan(16, 0), "\\[xgboost\\] Condition .* failed.\n");
EXPECT_DEATH(s1.subspan(17, 0), "\\[xgboost\\] Condition .* failed.\n");
auto constexpr kOne = static_cast<Span<int, 4>::index_type>(-1);
EXPECT_DEATH(s1.subspan<kOne>(), "\\[xgboost\\] Condition .* failed.\n");
EXPECT_DEATH(s1.subspan<16>(), "\\[xgboost\\] Condition .* failed.\n");
EXPECT_DEATH(s1.subspan<17>(), "\\[xgboost\\] Condition .* failed.\n");
}
TEST(Span, Compare) {

0
tests/cpp/common/test_threading_utils.cc Executable file → Normal file
View File