[EM] Multi-level quantile sketching for GPU. (#10813)
parent 3ef8383d93
commit ed5f33df16
@@ -105,17 +105,21 @@ def main():
     assert m_with_it.num_row() == m.num_row()
     # Tree method must be `hist`.
     reg_with_it = xgboost.train(
-        {"tree_method": "hist", "device": "cuda"}, m_with_it, num_boost_round=rounds
+        {"tree_method": "hist", "device": "cuda"},
+        m_with_it,
+        num_boost_round=rounds,
+        evals=[(m_with_it, "Train")],
     )
     predict_with_it = reg_with_it.predict(m_with_it)

     reg = xgboost.train(
-        {"tree_method": "hist", "device": "cuda"}, m, num_boost_round=rounds
+        {"tree_method": "hist", "device": "cuda"},
+        m,
+        num_boost_round=rounds,
+        evals=[(m, "Train")],
     )
     predict = reg.predict(m)

     numpy.testing.assert_allclose(predict_with_it, predict, rtol=1e-6)


 if __name__ == "__main__":
     main()
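For context, `m_with_it` above is built from a data iterator, which is exactly the path that feeds batches to the GPU sketcher this commit changes. A minimal sketch of that pattern using the public `xgboost.DataIter` interface; the `Batches` class, shapes, and seed here are illustrative and not part of this commit:

import numpy
import xgboost

class Batches(xgboost.DataIter):
    """Yield pre-made (X, y) batches to XGBoost one at a time."""

    def __init__(self, batches):
        self._batches = batches
        self._it = 0
        super().__init__()

    def next(self, input_data):
        # Called by XGBoost once per batch; return 0 when exhausted.
        if self._it == len(self._batches):
            return 0
        X, y = self._batches[self._it]
        input_data(data=X, label=y)
        self._it += 1
        return 1

    def reset(self):
        self._it = 0

rng = numpy.random.default_rng(2024)
batches = [(rng.normal(size=(256, 8)), rng.normal(size=256)) for _ in range(4)]
# Each fetched batch becomes one unit of sketching work on the GPU path.
m_with_it = xgboost.QuantileDMatrix(Batches(batches))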
@@ -143,17 +143,30 @@ class SketchContainer {
    */
   void Push(Context const* ctx, Span<Entry const> entries, Span<size_t> columns_ptr,
             common::Span<OffsetT> cuts_ptr, size_t total_cuts, Span<float> weights = {});
-  /* \brief Prune the quantile structure.
-   * \param to The maximum size of pruned quantile. If the size of quantile
-   * structure is already less than `to`, then no operation is performed.
+  /**
+   * @brief Prune the quantile structure.
+   *
+   * @param to The maximum size of the pruned quantile. If the size of the quantile
+   *           structure is already less than `to`, then no operation is performed.
    */
   void Prune(Context const* ctx, size_t to);
-  /* \brief Merge another set of sketch.
-   * \param that columns of other.
+  /**
+   * @brief Merge another set of sketches.
+   *
+   * @param that_columns_ptr Column pointer of the quantile summary being merged.
+   * @param that Columns of the other quantile summary.
    */
   void Merge(Context const* ctx, Span<OffsetT const> that_columns_ptr,
              Span<SketchEntry const> that);
+  /**
+   * @brief Shrink the internal data structure to reduce memory usage. Can be used after
+   * prune.
+   */
+  void ShrinkToFit() {
+    this->Current().shrink_to_fit();
+    this->Other().clear();
+    this->Other().shrink_to_fit();
+  }
+
   /* \brief Merge quantiles from other GPU workers. */
   void AllReduce(Context const* ctx, bool is_column_split);
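A conceptual note on `Prune` and `Merge`: per feature, the container holds a sorted summary of entries; merging interleaves two summaries, and pruning caps a summary's size while keeping entries spread across ranks. A single-feature toy in Python under those assumptions; the real `SketchContainer` tracks per-entry rank-error bounds and lives in device memory, so this is only a mental model, not the implementation:

import heapq

def merge(a, b):
    # Merge two sorted summaries into one sorted summary.
    return list(heapq.merge(a, b))

def prune(summary, to):
    # Keep at most `to` evenly spaced entries; no-op if the summary is
    # already small enough, matching the `to` parameter contract above.
    if len(summary) <= to:
        return summary
    step = (len(summary) - 1) / (to - 1)
    return [summary[round(i * step)] for i in range(to)]

print(prune(merge([0.1, 0.5, 0.9], [0.2, 0.4, 0.6, 0.8]), 5))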
@@ -3,6 +3,7 @@
  */
 #include <algorithm>  // for max
 #include <numeric>    // for partial_sum
+#include <utility>    // for pair
 #include <vector>     // for vector

 #include "../collective/allreduce.h"  // for Allreduce

@@ -29,11 +30,39 @@ void MakeSketches(Context const* ctx,
                   float missing, std::shared_ptr<common::HistogramCuts> cuts, MetaInfo const& info,
                   ExternalDataInfo* p_ext_info) {
   xgboost_NVTX_FN_RANGE();

-  std::unique_ptr<common::SketchContainer> sketch;
+  /**
+   * A variant of: A Fast Algorithm for Approximate Quantiles in High Speed Data Streams
+   *
+   * The original algorithm was designed for the CPU, where the input is a stream of
+   * individual elements. For the GPU, we process the data in batches. As a result, the
+   * implementation here simply uses the user input batch as the basic unit of sketching
+   * blocks. The number of blocks per level grows exponentially.
+   */
+  std::vector<std::pair<std::unique_ptr<common::SketchContainer>, bst_idx_t>> sketches;
   auto& ext_info = *p_ext_info;

+  auto lazy_init_sketch = [&] {
+    // Lazy because we need the `n_features`.
+    sketches.emplace_back(std::make_unique<common::SketchContainer>(
+                              proxy->Info().feature_types, p.max_bin, ext_info.n_features,
+                              data::BatchSamples(proxy), dh::GetDevice(ctx)),
+                          0);
+  };
+
+  // Workaround empty input with CPU ctx.
+  Context new_ctx;
+  Context const* p_ctx;
+  if (ctx->IsCUDA()) {
+    p_ctx = ctx;
+  } else {
+    new_ctx.UpdateAllowUnknown(Args{{"device", dh::GetDevice(ctx).Name()}});
+    p_ctx = &new_ctx;
+  }
+
   do {
+    /**
+     * Get the data shape.
+     */
     // We use do-while here as the first batch is fetched in the ctor.
     CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
     common::SetDevice(dh::GetDevice(ctx).ordinal);

@@ -46,28 +75,38 @@ void MakeSketches(Context const* ctx,
       CHECK_EQ(ext_info.n_features, ::xgboost::data::BatchColumns(proxy))
           << "Inconsistent number of columns.";
     }

-    auto batch_rows = data::BatchSamples(proxy);
-    ext_info.accumulated_rows += batch_rows;
-
     /**
      * Handle sketching.
      */
     if (!ref) {
-      if (!sketch) {
-        sketch = std::make_unique<common::SketchContainer>(
-            proxy->Info().feature_types, p.max_bin, ext_info.n_features, data::BatchSamples(proxy),
-            dh::GetDevice(ctx));
+      if (sketches.empty()) {
+        lazy_init_sketch();
       }
+      if (sketches.back().second > (1ul << (sketches.size() - 1))) {
+        auto n_cuts_per_feat =
+            common::detail::RequiredSampleCutsPerColumn(p.max_bin, ext_info.accumulated_rows);
+        // Prune to a single block.
+        sketches.back().first->Prune(p_ctx, n_cuts_per_feat);
+        sketches.back().first->ShrinkToFit();
+
+        sketches.back().second = 1;
+        lazy_init_sketch();  // Add a new level.
+      }
       proxy->Info().weights_.SetDevice(dh::GetDevice(ctx));
       cuda_impl::Dispatch(proxy, [&](auto const& value) {
-        // Workaround empty input with CPU ctx.
-        Context new_ctx;
-        Context const* p_ctx;
-        if (ctx->IsCUDA()) {
-          p_ctx = ctx;
-        } else {
-          new_ctx.UpdateAllowUnknown(Args{{"device", dh::GetDevice(ctx).Name()}});
-          p_ctx = &new_ctx;
-        }
-        common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing, sketch.get());
+        common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing,
+                                    sketches.back().first.get());
+        sketches.back().second++;
       });
     }
+    auto batch_rows = data::BatchSamples(proxy);
+    ext_info.accumulated_rows += batch_rows;

     /**
      * Rest of the data shape.
      */
     dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
     common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
     ext_info.row_stride =

@@ -87,7 +126,28 @@ void MakeSketches(Context const* ctx,
   // Get reference
   common::SetDevice(dh::GetDevice(ctx).ordinal);
   if (!ref) {
-    sketch->MakeCuts(ctx, cuts.get(), info.IsColumnSplit());
+    HostDeviceVector<FeatureType> ft;
+    common::SketchContainer final_sketch(
+        sketches.empty() ? ft : sketches.front().first->FeatureTypes(), p.max_bin,
+        ext_info.n_features, ext_info.accumulated_rows, dh::GetDevice(ctx));
+    // Reverse order since the last container might contain a summary that's not yet pruned.
+    for (auto it = sketches.crbegin(); it != sketches.crend(); ++it) {
+      auto& sketch = *it;
+
+      CHECK_GE(sketch.second, 1);
+      if (sketch.second > 1) {
+        sketch.first->Prune(p_ctx, common::detail::RequiredSampleCutsPerColumn(
+                                       p.max_bin, ext_info.accumulated_rows));
+        sketch.first->ShrinkToFit();
+      }
+      final_sketch.Merge(p_ctx, sketch.first->ColumnsPtr(), sketch.first->Data());
+      final_sketch.FixError();
+    }
+
+    sketches.clear();
+    sketches.shrink_to_fit();
+
+    final_sketch.MakeCuts(ctx, cuts.get(), info.IsColumnSplit());
   } else {
     GetCutsFromRef(ctx, ref, ext_info.n_features, p, cuts.get());
   }
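The control flow above amounts to a simple schedule: each level accumulates sketch blocks until it exceeds its capacity (which doubles with each new level), at which point it is pruned down to a single block and a new level is opened; at the end, `final_sketch` merges the levels in reverse. A toy trace of that schedule, assuming the check-then-push order shown in the diff (counters only, no actual sketching):

def push_batch(levels):
    # `levels[i]` mirrors `sketches[i].second`, the block count of level i.
    if not levels:
        levels.append(0)  # lazy_init_sketch: open the first level
    if levels[-1] > (1 << (len(levels) - 1)):
        levels[-1] = 1    # prune the full level down to a single block
        levels.append(0)  # lazy_init_sketch: add a new level
    levels[-1] += 1       # sketch this batch into the last level

levels = []
for _ in range(64):
    push_batch(levels)
print(levels)  # block count per level after 64 batches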
@@ -289,11 +289,11 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S>, public FormatStreamPolicy {
     auto page = std::make_shared<S>();
     this->exce_.Run([&] {
       std::unique_ptr<typename FormatStreamPolicy::FormatT> fmt{
-          this->CreatePageFormat(this->param_)};
+          self->CreatePageFormat(self->param_)};
       auto name = self->cache_info_->ShardName();
       auto [offset, length] = self->cache_info_->View(fetch_it);
       std::unique_ptr<typename FormatStreamPolicy::ReaderT> fi{
-          this->CreateReader(name, offset, length)};
+          self->CreateReader(name, offset, length)};
       CHECK(fmt->Read(page.get(), fi.get()));
     });
     return page;
@@ -17,6 +17,7 @@

 #include "../../../include/xgboost/logging.h"
 #include "../../../src/common/cuda_context.cuh"
+#include "../../../src/common/cuda_rt_utils.h"  // for SetDevice
 #include "../../../src/common/device_helpers.cuh"
 #include "../../../src/common/hist_util.cuh"
 #include "../../../src/common/hist_util.h"

@@ -59,8 +60,7 @@ TEST(HistUtil, SketchBatchNumElements) {
   GTEST_SKIP_("Test not runnable with RMM enabled.");
 #endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
   size_t constexpr kCols = 10000;
-  int device;
-  dh::safe_cuda(cudaGetDevice(&device));
+  std::int32_t device = dh::CurrentDevice();
   auto avail = static_cast<size_t>(dh::AvailableMemory(device) * 0.8);
   auto per_elem = detail::BytesPerElement(false);
   auto avail_elem = avail / per_elem;

@@ -576,7 +576,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {

 namespace {
 auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
-  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
+  common::SetDevice(ctx->Ordinal());
   auto n = n_samples * n_features;
   std::vector<float> x;
   x.resize(n);
@@ -13,7 +13,7 @@

 namespace xgboost::data {
 void TestEquivalent(float sparsity) {
-  Context ctx{MakeCUDACtx(0)};
+  auto ctx = MakeCUDACtx(0);

   CudaArrayIterForTest iter{sparsity};
   IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,