[EM] Avoid stream sync in quantile sketching. (#10765)

.
This commit is contained in:
Jiaming Yuan
2024-08-30 12:33:24 +08:00
committed by GitHub
parent 61dd854a52
commit 34d4ab455e
12 changed files with 313 additions and 313 deletions

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2022-2023 by XGBoost Contributors
* Copyright 2022-2024, XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_ALGORITHM_CUH_
#define XGBOOST_COMMON_ALGORITHM_CUH_
@@ -258,5 +258,19 @@ void ArgSort(xgboost::Context const *ctx, xgboost::common::Span<U> keys,
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice,
cuctx->Stream()));
}
template <typename InIt, typename OutIt, typename Predicate>
void CopyIf(CUDAContext const *cuctx, InIt in_first, InIt in_second, OutIt out_first,
Predicate pred) {
// We loop over batches because thrust::copy_if can't deal with sizes > 2^31
// See thrust issue #1302, XGBoost #6822
size_t constexpr kMaxCopySize = std::numeric_limits<int>::max() / 2;
size_t length = std::distance(in_first, in_second);
for (size_t offset = 0; offset < length; offset += kMaxCopySize) {
auto begin_input = in_first + offset;
auto end_input = in_first + std::min(offset + kMaxCopySize, length);
out_first = thrust::copy_if(cuctx->CTP(), begin_input, end_input, out_first, pred);
}
}
} // namespace xgboost::common
#endif // XGBOOST_COMMON_ALGORITHM_CUH_

View File

@@ -637,12 +637,11 @@ struct SegmentedUniqueReduceOp {
* \return Number of unique values in total.
*/
template <typename DerivedPolicy, typename KeyInIt, typename KeyOutIt, typename ValInIt,
typename ValOutIt, typename CompValue, typename CompKey>
size_t
SegmentedUnique(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
KeyInIt key_segments_first, KeyInIt key_segments_last, ValInIt val_first,
ValInIt val_last, KeyOutIt key_segments_out, ValOutIt val_out,
CompValue comp, CompKey comp_key=thrust::equal_to<size_t>{}) {
typename ValOutIt, typename CompValue, typename CompKey = thrust::equal_to<size_t>>
size_t SegmentedUnique(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
KeyInIt key_segments_first, KeyInIt key_segments_last, ValInIt val_first,
ValInIt val_last, KeyOutIt key_segments_out, ValOutIt val_out,
CompValue comp, CompKey comp_key = thrust::equal_to<size_t>{}) {
using Key = thrust::pair<size_t, typename thrust::iterator_traits<ValInIt>::value_type>;
auto unique_key_it = dh::MakeTransformIterator<Key>(
thrust::make_counting_iterator(static_cast<size_t>(0)),
@@ -676,16 +675,6 @@ SegmentedUnique(const thrust::detail::execution_policy_base<DerivedPolicy> &exec
return n_uniques;
}
template <typename... Inputs,
std::enable_if_t<std::tuple_size<std::tuple<Inputs...>>::value == 7>
* = nullptr>
size_t SegmentedUnique(Inputs &&...inputs) {
dh::XGBCachingDeviceAllocator<char> alloc;
return SegmentedUnique(thrust::cuda::par(alloc),
std::forward<Inputs &&>(inputs)...,
thrust::equal_to<size_t>{});
}
/**
* \brief Unique by key for many groups of data. Has same constraint as `SegmentedUnique`.
*
@@ -793,21 +782,6 @@ void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
#endif
}
template <typename InIt, typename OutIt, typename Predicate>
void CopyIf(InIt in_first, InIt in_second, OutIt out_first, Predicate pred) {
// We loop over batches because thrust::copy_if can't deal with sizes > 2^31
// See thrust issue #1302, XGBoost #6822
size_t constexpr kMaxCopySize = std::numeric_limits<int>::max() / 2;
size_t length = std::distance(in_first, in_second);
XGBCachingDeviceAllocator<char> alloc;
for (size_t offset = 0; offset < length; offset += kMaxCopySize) {
auto begin_input = in_first + offset;
auto end_input = in_first + std::min(offset + kMaxCopySize, length);
out_first = thrust::copy_if(thrust::cuda::par(alloc), begin_input,
end_input, out_first, pred);
}
}
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items) {
InclusiveScan(d_in, d_out, cub::Sum(), num_items);

View File

@@ -106,26 +106,27 @@ size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_idx_t num_ro
return std::min(sketch_batch_num_elements, kIntMax);
}
void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* sorted_entries) {
void SortByWeight(Context const* ctx, dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries) {
// Sort both entries and wegihts.
dh::XGBDeviceAllocator<char> alloc;
auto cuctx = ctx->CUDACtx();
CHECK_EQ(weights->size(), sorted_entries->size());
thrust::sort_by_key(thrust::cuda::par(alloc), sorted_entries->begin(), sorted_entries->end(),
weights->begin(), detail::EntryCompareOp());
thrust::sort_by_key(cuctx->TP(), sorted_entries->begin(), sorted_entries->end(), weights->begin(),
detail::EntryCompareOp());
// Scan weights
dh::XGBCachingDeviceAllocator<char> caching;
thrust::inclusive_scan_by_key(
thrust::cuda::par(caching), sorted_entries->begin(), sorted_entries->end(), weights->begin(),
cuctx->CTP(), sorted_entries->begin(), sorted_entries->end(), weights->begin(),
weights->begin(),
[=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
}
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_idx_t> d_cuts_ptr,
void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info,
Span<bst_idx_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
info.feature_types.SetDevice(device);
info.feature_types.SetDevice(ctx->Device());
auto d_feature_types = info.feature_types.ConstDeviceSpan();
CHECK(!d_feature_types.empty());
auto& column_sizes_scan = *p_column_sizes_scan;
@@ -142,30 +143,32 @@ void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst
auto d_sorted_weights = dh::ToSpan(*p_sorted_weights);
auto val_in_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
auto val_out_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
n_uniques = dh::SegmentedUnique(
column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
val_in_it, val_in_it + sorted_entries.size(), new_column_scan.data().get(), val_out_it,
[=] __device__(Pair const& l, Pair const& r) {
Entry const& le = thrust::get<0>(l);
Entry const& re = thrust::get<0>(r);
if (le.index == re.index && IsCat(d_feature_types, le.index)) {
return le.fvalue == re.fvalue;
}
return false;
});
n_uniques =
dh::SegmentedUnique(ctx->CUDACtx()->CTP(), column_sizes_scan.data().get(),
column_sizes_scan.data().get() + column_sizes_scan.size(), val_in_it,
val_in_it + sorted_entries.size(), new_column_scan.data().get(),
val_out_it, [=] __device__(Pair const& l, Pair const& r) {
Entry const& le = thrust::get<0>(l);
Entry const& re = thrust::get<0>(r);
if (le.index == re.index && IsCat(d_feature_types, le.index)) {
return le.fvalue == re.fvalue;
}
return false;
});
p_sorted_weights->resize(n_uniques);
} else {
n_uniques = dh::SegmentedUnique(
column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
sorted_entries.begin(), sorted_entries.end(), new_column_scan.data().get(),
sorted_entries.begin(), [=] __device__(Entry const& l, Entry const& r) {
if (l.index == r.index) {
if (IsCat(d_feature_types, l.index)) {
return l.fvalue == r.fvalue;
}
}
return false;
});
n_uniques = dh::SegmentedUnique(ctx->CUDACtx()->CTP(), column_sizes_scan.data().get(),
column_sizes_scan.data().get() + column_sizes_scan.size(),
sorted_entries.begin(), sorted_entries.end(),
new_column_scan.data().get(), sorted_entries.begin(),
[=] __device__(Entry const& l, Entry const& r) {
if (l.index == r.index) {
if (IsCat(d_feature_types, l.index)) {
return l.fvalue == r.fvalue;
}
}
return false;
});
}
sorted_entries.resize(n_uniques);
@@ -189,7 +192,7 @@ void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst
}
});
// Turn size into ptr.
thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(), new_cuts_size.cend(),
thrust::exclusive_scan(ctx->CUDACtx()->CTP(), new_cuts_size.cbegin(), new_cuts_size.cend(),
d_cuts_ptr.data());
}
} // namespace detail
@@ -225,7 +228,7 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
std::size_t ridx = dh::SegmentId(row_ptrs, element_idx);
d_temp_weight[idx] = sample_weight[ridx + base_rowid];
});
detail::SortByWeight(&entry_weight, &sorted_entries);
detail::SortByWeight(ctx, &entry_weight, &sorted_entries);
} else {
thrust::sort(cuctx->TP(), sorted_entries.begin(), sorted_entries.end(),
detail::EntryCompareOp());
@@ -238,13 +241,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size.
});
detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature,
detail::GetColumnSizesScan(ctx->CUDACtx(), ctx->Device(), info.num_col_, num_cuts_per_feature,
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight,
detail::RemoveDuplicatedCategories(ctx, info, d_cuts_ptr, &sorted_entries, p_weight,
&column_sizes_scan);
}
@@ -252,7 +255,7 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
CHECK_EQ(d_cuts_ptr.size(), column_sizes_scan.size());
// Add cuts into sketches
sketch_container->Push(dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr,
sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr,
h_cuts_ptr.back(), dh::ToSpan(entry_weight));
sorted_entries.clear();

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2023 by XGBoost contributors
* Copyright 2020-2024, XGBoost contributors
*
* \brief Front end and utilities for GPU based sketching. Works on sliding window
* instead of stream.
@@ -13,6 +13,8 @@
#include <cstddef> // for size_t
#include "../data/adapter.h" // for IsValidFunctor
#include "algorithm.cuh" // for CopyIf
#include "cuda_context.cuh" // for CUDAContext
#include "device_helpers.cuh"
#include "hist_util.h"
#include "quantile.cuh"
@@ -107,9 +109,10 @@ std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shar
* \param out_column_size Output buffer for the size of each column.
*/
template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
void LaunchGetColumnSizeKernel(CUDAContext const* cuctx, DeviceOrd device,
IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
Span<std::size_t> out_column_size) {
thrust::fill_n(cuctx->CTP(), dh::tbegin(out_column_size), out_column_size.size(), 0);
std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
// Not strictly correct as we should use number of samples to determine the type of
@@ -135,17 +138,17 @@ void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
CHECK(!force_use_u64);
auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::uint32_t, BatchIt>;
auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}(
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, cuctx->Stream()}(
kernel, batch_iter, is_valid, out_column_size);
} else {
auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::size_t, BatchIt>;
auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}(
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, cuctx->Stream()}(
kernel, batch_iter, is_valid, out_column_size);
}
} else {
auto d_out_column_size = out_column_size;
dh::LaunchN(batch_iter.size(), [=] __device__(size_t idx) {
dh::LaunchN(batch_iter.size(), cuctx->Stream(), [=] __device__(size_t idx) {
auto e = batch_iter[idx];
if (is_valid(e)) {
atomicAdd(&d_out_column_size[e.column_idx], static_cast<size_t>(1));
@@ -155,26 +158,26 @@ void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
}
template <typename BatchIt>
void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
void GetColumnSizesScan(CUDAContext const* cuctx, DeviceOrd device, size_t num_columns,
std::size_t num_cuts_per_feature, IterSpan<BatchIt> batch_iter,
data::IsValidFunctor is_valid,
HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
dh::caching_device_vector<size_t>* column_sizes_scan) {
column_sizes_scan->resize(num_columns + 1);
cuts_ptr->SetDevice(device);
cuts_ptr->Resize(num_columns + 1, 0);
dh::XGBCachingDeviceAllocator<char> alloc;
auto d_column_sizes_scan = dh::ToSpan(*column_sizes_scan);
LaunchGetColumnSizeKernel(device, batch_iter, is_valid, d_column_sizes_scan);
LaunchGetColumnSizeKernel(cuctx, device, batch_iter, is_valid, d_column_sizes_scan);
// Calculate cuts CSC pointer
auto cut_ptr_it = dh::MakeTransformIterator<size_t>(
column_sizes_scan->begin(), [=] __device__(size_t column_size) {
return thrust::min(num_cuts_per_feature, column_size);
});
thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
thrust::exclusive_scan(cuctx->CTP(), cut_ptr_it,
cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
column_sizes_scan->end(), column_sizes_scan->begin());
thrust::exclusive_scan(cuctx->CTP(), column_sizes_scan->begin(), column_sizes_scan->end(),
column_sizes_scan->begin());
}
inline size_t constexpr BytesPerElement(bool has_weight) {
@@ -215,9 +218,9 @@ size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz,
// Count the valid entries in each column and copy them out.
template <typename AdapterBatch, typename BatchIter>
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
float missing, size_t columns, size_t cuts_per_feature,
DeviceOrd device,
void MakeEntriesFromAdapter(CUDAContext const* cuctx, AdapterBatch const& batch,
BatchIter batch_iter, Range1d range, float missing, size_t columns,
size_t cuts_per_feature, DeviceOrd device,
HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
dh::caching_device_vector<size_t>* column_sizes_scan,
dh::device_vector<Entry>* sorted_entries) {
@@ -229,19 +232,20 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
auto span = IterSpan{batch_iter + range.begin(), n};
data::IsValidFunctor is_valid(missing);
// Work out how many valid entries we have in each column
GetColumnSizesScan(device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan,
GetColumnSizesScan(cuctx, device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan,
column_sizes_scan);
size_t num_valid = column_sizes_scan->back();
// Copy current subset of valid elements into temporary storage and sort
sorted_entries->resize(num_valid);
dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(), sorted_entries->begin(),
is_valid);
CopyIf(cuctx, entry_iter + range.begin(), entry_iter + range.end(), sorted_entries->begin(),
is_valid);
}
void SortByWeight(dh::device_vector<float>* weights,
void SortByWeight(Context const* ctx, dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries);
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_idx_t> d_cuts_ptr,
void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info,
Span<bst_idx_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan);
@@ -278,10 +282,9 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t
}
template <typename AdapterBatch>
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
DeviceOrd device, size_t columns, size_t begin, size_t end,
float missing, SketchContainer *sketch_container,
int num_cuts) {
void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInfo const& info,
size_t columns, size_t begin, size_t end, float missing,
SketchContainer* sketch_container, int num_cuts) {
// Copy current subset of valid elements into temporary storage and sort
dh::device_vector<Entry> sorted_entries;
dh::caching_device_vector<size_t> column_sizes_scan;
@@ -289,54 +292,45 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) { return batch.GetElement(idx); });
HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
cuts_ptr.SetDevice(device);
detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing,
columns, num_cuts, device,
&cuts_ptr,
&column_sizes_scan,
&sorted_entries);
dh::XGBDeviceAllocator<char> alloc;
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
sorted_entries.end(), detail::EntryCompareOp());
cuts_ptr.SetDevice(ctx->Device());
CUDAContext const* cuctx = ctx->CUDACtx();
detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, columns, num_cuts,
ctx->Device(), &cuts_ptr, &column_sizes_scan, &sorted_entries);
thrust::sort(cuctx->TP(), sorted_entries.begin(), sorted_entries.end(), detail::EntryCompareOp());
if (sketch_container->HasCategorical()) {
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
detail::RemoveDuplicatedCategories(ctx, info, d_cuts_ptr, &sorted_entries, nullptr,
&column_sizes_scan);
}
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
auto const &h_cuts_ptr = cuts_ptr.HostVector();
// Extract the cuts from all columns concurrently
sketch_container->Push(dh::ToSpan(sorted_entries),
dh::ToSpan(column_sizes_scan), d_cuts_ptr,
sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr,
h_cuts_ptr.back());
sorted_entries.clear();
sorted_entries.shrink_to_fit();
}
template <typename Batch>
void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
int num_cuts_per_feature,
bool is_ranking, float missing, DeviceOrd device,
size_t columns, size_t begin, size_t end,
SketchContainer *sketch_container) {
dh::XGBCachingDeviceAllocator<char> alloc;
void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo const& info,
int num_cuts_per_feature, bool is_ranking, float missing,
DeviceOrd device, size_t columns, size_t begin, size_t end,
SketchContainer* sketch_container) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
info.weights_.SetDevice(device);
auto weights = info.weights_.ConstDeviceSpan();
auto batch_iter = dh::MakeTransformIterator<data::COOTuple>(
thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) { return batch.GetElement(idx); });
thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) { return batch.GetElement(idx); });
auto cuctx = ctx->CUDACtx();
dh::device_vector<Entry> sorted_entries;
dh::caching_device_vector<size_t> column_sizes_scan;
HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
detail::MakeEntriesFromAdapter(batch, batch_iter,
{begin, end}, missing,
columns, num_cuts_per_feature, device,
&cuts_ptr,
&column_sizes_scan,
detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, columns,
num_cuts_per_feature, device, &cuts_ptr, &column_sizes_scan,
&sorted_entries);
data::IsValidFunctor is_valid(missing);
@@ -355,7 +349,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx);
return weights[group_idx];
});
auto retit = thrust::copy_if(thrust::cuda::par(alloc),
auto retit = thrust::copy_if(cuctx->CTP(),
weight_iter + begin, weight_iter + end,
batch_iter + begin,
d_temp_weights.data(), // output
@@ -368,7 +362,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
[=]__device__(size_t idx) -> float {
return weights[batch.GetElement(idx).row_idx];
});
auto retit = thrust::copy_if(thrust::cuda::par(alloc),
auto retit = thrust::copy_if(cuctx->CTP(),
weight_iter + begin, weight_iter + end,
batch_iter + begin,
d_temp_weights.data(), // output
@@ -376,11 +370,11 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
}
detail::SortByWeight(&temp_weights, &sorted_entries);
detail::SortByWeight(ctx, &temp_weights, &sorted_entries);
if (sketch_container->HasCategorical()) {
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
detail::RemoveDuplicatedCategories(ctx, info, d_cuts_ptr, &sorted_entries, &temp_weights,
&column_sizes_scan);
}
@@ -388,8 +382,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
// Extract cuts
sketch_container->Push(dh::ToSpan(sorted_entries),
dh::ToSpan(column_sizes_scan), d_cuts_ptr,
sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr,
h_cuts_ptr.back(), dh::ToSpan(temp_weights));
sorted_entries.clear();
sorted_entries.shrink_to_fit();
@@ -407,8 +400,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
* testing.
*/
template <typename Batch>
void AdapterDeviceSketch(Batch batch, int num_bins,
MetaInfo const& info,
void AdapterDeviceSketch(Context const* ctx, Batch batch, int num_bins, MetaInfo const& info,
float missing, SketchContainer* sketch_container,
size_t sketch_batch_num_elements = 0) {
size_t num_rows = batch.NumRows();
@@ -419,27 +411,24 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
if (weighted) {
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits<size_t>::max(),
device.ordinal, num_cuts_per_feature, true);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
ProcessWeightedSlidingWindow(batch, info,
num_cuts_per_feature,
HostSketchContainer::UseGroup(info), missing, device, num_cols, begin, end,
sketch_container);
ProcessWeightedSlidingWindow(ctx, batch, info, num_cuts_per_feature,
HostSketchContainer::UseGroup(info), missing, device, num_cols,
begin, end, sketch_container);
}
} else {
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits<size_t>::max(),
device.ordinal, num_cuts_per_feature, false);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
sketch_container, num_cuts_per_feature);
ProcessSlidingWindow(ctx, batch, info, num_cols, begin, end, missing, sketch_container,
num_cuts_per_feature);
}
}
}

View File

@@ -18,6 +18,8 @@
#include "../collective/communicator-inl.h" // for GetWorldSize, GetRank
#include "categorical.h"
#include "common.h"
#include "cuda_context.cuh" // for CUDAContext
#include "cuda_rt_utils.h" // for SetDevice
#include "device_helpers.cuh"
#include "hist_util.h"
#include "quantile.cuh"
@@ -117,6 +119,7 @@ void CopyTo(Span<T> out, Span<U> src) {
// Compute the merge path.
common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
Context const* ctx,
Span<SketchEntry const> const &d_x, Span<bst_idx_t const> const &x_ptr,
Span<SketchEntry const> const &d_y, Span<bst_idx_t const> const &y_ptr,
Span<SketchEntry> out, Span<bst_idx_t> out_ptr) {
@@ -142,13 +145,12 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
auto y_merge_val_it =
thrust::make_zip_iterator(thrust::make_tuple(b_ind_iter, place_holder));
dh::XGBCachingDeviceAllocator<Tuple> alloc;
static_assert(sizeof(Tuple) == sizeof(SketchEntry));
// We reuse the memory for storing merge path.
common::Span<Tuple> merge_path{reinterpret_cast<Tuple *>(out.data()), out.size()};
// Determine the merge path, 0 if element is from x, 1 if it's from y.
thrust::merge_by_key(
thrust::cuda::par(alloc), x_merge_key_it, x_merge_key_it + d_x.size(),
ctx->CUDACtx()->CTP(), x_merge_key_it, x_merge_key_it + d_x.size(),
y_merge_key_it, y_merge_key_it + d_y.size(), x_merge_val_it,
y_merge_val_it, thrust::make_discard_iterator(), merge_path.data(),
[=] __device__(auto const &l, auto const &r) -> bool {
@@ -163,10 +165,9 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
// Compute output ptr
auto transform_it =
thrust::make_zip_iterator(thrust::make_tuple(x_ptr.data(), y_ptr.data()));
thrust::transform(
thrust::cuda::par(alloc), transform_it, transform_it + x_ptr.size(),
out_ptr.data(),
[] __device__(auto const& t) { return thrust::get<0>(t) + thrust::get<1>(t); });
thrust::transform(ctx->CUDACtx()->CTP(), transform_it, transform_it + x_ptr.size(),
out_ptr.data(),
[] __device__(auto const &t) { return thrust::get<0>(t) + thrust::get<1>(t); });
// 0^th is the indicator, 1^th is placeholder
auto get_ind = []XGBOOST_DEVICE(Tuple const& t) { return thrust::get<0>(t); };
@@ -194,7 +195,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
// is landed into output as the first element in merge result. The scan result is the
// subscript of x and y.
thrust::exclusive_scan_by_key(
thrust::cuda::par(alloc), scan_key_it, scan_key_it + merge_path.size(),
ctx->CUDACtx()->CTP(), scan_key_it, scan_key_it + merge_path.size(),
scan_val_it, merge_path.data(),
thrust::make_tuple<uint64_t, uint64_t>(0ul, 0ul),
thrust::equal_to<size_t>{},
@@ -209,18 +210,17 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
// summary does the output element come from) result by definition of merged rank. So we
// run it in 2 passes to obtain the merge path and then customize the standard merge
// algorithm.
void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
void MergeImpl(Context const *ctx, Span<SketchEntry const> const &d_x,
Span<bst_idx_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
Span<bst_idx_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_idx_t> out_ptr) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
CHECK_EQ(d_x.size() + d_y.size(), out.size());
CHECK_EQ(x_ptr.size(), out_ptr.size());
CHECK_EQ(y_ptr.size(), out_ptr.size());
auto d_merge_path = MergePath(d_x, x_ptr, d_y, y_ptr, out, out_ptr);
auto d_merge_path = MergePath(ctx, d_x, x_ptr, d_y, y_ptr, out, out_ptr);
auto d_out = out;
dh::LaunchN(d_out.size(), [=] __device__(size_t idx) {
dh::LaunchN(d_out.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t idx) {
auto column_id = dh::SegmentId(out_ptr, idx);
idx -= out_ptr[column_id];
@@ -307,10 +307,9 @@ void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
});
}
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr,
size_t total_cuts, Span<float> weights) {
dh::safe_cuda(cudaSetDevice(device_.ordinal));
void SketchContainer::Push(Context const *ctx, Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr, size_t total_cuts, Span<float> weights) {
common::SetDevice(device_.ordinal);
Span<SketchEntry> out;
dh::device_vector<SketchEntry> cuts;
bool first_window = this->Current().empty();
@@ -346,12 +345,12 @@ void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
}; // NOLINT
PruneImpl<Entry>(cuts_ptr, entries, columns_ptr, ft, out, to_sketch_entry);
}
auto n_uniques = this->ScanInput(out, cuts_ptr);
auto n_uniques = this->ScanInput(ctx, out, cuts_ptr);
if (!first_window) {
CHECK_EQ(this->columns_ptr_.Size(), cuts_ptr.size());
out = out.subspan(0, n_uniques);
this->Merge(cuts_ptr, out);
this->Merge(ctx, cuts_ptr, out);
this->FixError();
} else {
this->Current().resize(n_uniques);
@@ -363,7 +362,8 @@ void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
}
}
size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_columns_ptr_in) {
size_t SketchContainer::ScanInput(Context const *ctx, Span<SketchEntry> entries,
Span<OffsetT> d_columns_ptr_in) {
/* There are 2 types of duplication. First is duplicated feature values, which comes
* from user input data. Second is duplicated sketching entries, which is generated by
* pruning or merging. We preserve the first type and remove the second type.
@@ -371,7 +371,6 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_.ordinal));
CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
dh::XGBCachingDeviceAllocator<char> alloc;
auto key_it = dh::MakeTransformIterator<size_t>(
thrust::make_reverse_iterator(thrust::make_counting_iterator(entries.size())),
@@ -381,7 +380,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
// Reverse scan to accumulate weights into first duplicated element on left.
auto val_it = thrust::make_reverse_iterator(dh::tend(entries));
thrust::inclusive_scan_by_key(
thrust::cuda::par(alloc), key_it, key_it + entries.size(),
ctx->CUDACtx()->CTP(), key_it, key_it + entries.size(),
val_it, val_it,
thrust::equal_to<size_t>{},
[] __device__(SketchEntry const &r, SketchEntry const &l) {
@@ -396,18 +395,18 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
auto d_columns_ptr_out = columns_ptr_b_.DeviceSpan();
// thrust unique_by_key preserves the first element.
auto n_uniques = dh::SegmentedUnique(
d_columns_ptr_in.data(),
d_columns_ptr_in.data() + d_columns_ptr_in.size(), entries.data(),
entries.data() + entries.size(), d_columns_ptr_out.data(), entries.data(),
detail::SketchUnique{});
auto n_uniques =
dh::SegmentedUnique(ctx->CUDACtx()->CTP(), d_columns_ptr_in.data(),
d_columns_ptr_in.data() + d_columns_ptr_in.size(), entries.data(),
entries.data() + entries.size(), d_columns_ptr_out.data(), entries.data(),
detail::SketchUnique{});
CopyTo(d_columns_ptr_in, d_columns_ptr_out);
timer_.Stop(__func__);
return n_uniques;
}
void SketchContainer::Prune(size_t to) {
void SketchContainer::Prune(Context const* ctx, std::size_t to) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_.ordinal));
@@ -438,19 +437,19 @@ void SketchContainer::Prune(size_t to) {
this->columns_ptr_.Copy(columns_ptr_b_);
this->Alternate();
this->Unique();
this->Unique(ctx);
timer_.Stop(__func__);
}
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
void SketchContainer::Merge(Context const *ctx, Span<OffsetT const> d_that_columns_ptr,
Span<SketchEntry const> that) {
dh::safe_cuda(cudaSetDevice(device_.ordinal));
common::SetDevice(device_.ordinal);
timer_.Start(__func__);
if (this->Current().size() == 0) {
CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
CHECK_EQ(this->columns_ptr_.HostVector().size(), d_that_columns_ptr.size());
CHECK_EQ(columns_ptr_.Size(), num_columns_ + 1);
thrust::copy(thrust::device, d_that_columns_ptr.data(),
thrust::copy(ctx->CUDACtx()->CTP(), d_that_columns_ptr.data(),
d_that_columns_ptr.data() + d_that_columns_ptr.size(),
this->columns_ptr_.DevicePointer());
auto total = this->columns_ptr_.HostVector().back();
@@ -463,7 +462,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
this->Other().resize(this->Current().size() + that.size());
CHECK_EQ(d_that_columns_ptr.size(), this->columns_ptr_.Size());
MergeImpl(device_, this->Data(), this->ColumnsPtr(), that, d_that_columns_ptr,
MergeImpl(ctx, this->Data(), this->ColumnsPtr(), that, d_that_columns_ptr,
dh::ToSpan(this->Other()), columns_ptr_b_.DeviceSpan());
this->columns_ptr_.Copy(columns_ptr_b_);
CHECK_EQ(this->columns_ptr_.Size(), num_columns_ + 1);
@@ -471,7 +470,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
if (this->HasCategorical()) {
auto d_feature_types = this->FeatureTypes().ConstDeviceSpan();
this->Unique([d_feature_types] __device__(size_t l_fidx, size_t r_fidx) {
this->Unique(ctx, [d_feature_types] __device__(size_t l_fidx, size_t r_fidx) {
return l_fidx == r_fidx && IsCat(d_feature_types, l_fidx);
});
}
@@ -517,7 +516,7 @@ void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) {
SafeColl(rc);
bst_idx_t intermediate_num_cuts =
std::min(global_sum_rows, static_cast<size_t>(num_bins_ * kFactor));
this->Prune(intermediate_num_cuts);
this->Prune(ctx, intermediate_num_cuts);
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
CHECK_EQ(d_columns_ptr.size(), num_columns_ + 1);
@@ -570,9 +569,8 @@ void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) {
for (size_t i = 0; i < allworkers.size(); ++i) {
auto worker = allworkers[i];
auto worker_ptr =
dh::ToSpan(gathered_ptrs)
.subspan(i * d_columns_ptr.size(), d_columns_ptr.size());
new_sketch.Merge(worker_ptr, worker);
dh::ToSpan(gathered_ptrs).subspan(i * d_columns_ptr.size(), d_columns_ptr.size());
new_sketch.Merge(ctx, worker_ptr, worker);
new_sketch.FixError();
}
@@ -602,7 +600,7 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i
this->AllReduce(ctx, is_column_split);
// Prune to final number of bins.
this->Prune(num_bins_ + 1);
this->Prune(ctx, num_bins_ + 1);
this->FixError();
// Set up inputs
@@ -624,7 +622,6 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i
std::vector<SketchEntry> max_values;
float max_cat{-1.f};
if (has_categorical_) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto key_it = dh::MakeTransformIterator<bst_feature_t>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) -> bst_feature_t {
return dh::SegmentId(d_in_columns_ptr, i);
@@ -651,7 +648,7 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i
dh::caching_device_vector<size_t> d_max_keys(d_in_columns_ptr.size() - 1);
dh::caching_device_vector<SketchEntry> d_max_values(d_in_columns_ptr.size() - 1);
auto new_end = thrust::reduce_by_key(
thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
ctx->CUDACtx()->CTP(), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
d_max_values.begin(), thrust::equal_to<bst_feature_t>{},
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
d_max_keys.erase(new_end.first, d_max_keys.end());
@@ -661,7 +658,7 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i
SketchEntry default_entry{};
dh::caching_device_vector<SketchEntry> d_max_results(d_in_columns_ptr.size() - 1,
default_entry);
thrust::scatter(thrust::cuda::par(alloc), d_max_values.begin(), d_max_values.end(),
thrust::scatter(ctx->CUDACtx()->CTP(), d_max_values.begin(), d_max_values.end(),
d_max_keys.begin(), d_max_results.begin());
dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_results));
auto max_it = MakeIndexTransformIter([&](auto i) {

View File

@@ -7,6 +7,7 @@
#include <thrust/logical.h> // for any_of
#include "categorical.h"
#include "cuda_context.cuh" // for CUDAContext
#include "device_helpers.cuh"
#include "error_msg.h" // for InvalidMaxBin
#include "quantile.h"
@@ -127,7 +128,7 @@ class SketchContainer {
/* \brief Whether the predictor matrix contains categorical features. */
bool HasCategorical() const { return has_categorical_; }
/* \brief Accumulate weights of duplicated entries in input. */
size_t ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_columns_ptr_in);
size_t ScanInput(Context const* ctx, Span<SketchEntry> entries, Span<OffsetT> d_columns_ptr_in);
/* Fix rounding error and re-establish invariance. The error is mostly generated by the
* addition inside `RMinNext` and subtraction in `RMaxPrev`. */
void FixError();
@@ -140,19 +141,18 @@ class SketchContainer {
* \param total_cuts Total number of cuts, equal to the back of cuts_ptr.
* \param weights (optional) data weights.
*/
void Push(Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr, size_t total_cuts,
Span<float> weights = {});
void Push(Context const* ctx, Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr, size_t total_cuts, Span<float> weights = {});
/* \brief Prune the quantile structure.
*
* \param to The maximum size of pruned quantile. If the size of quantile
* structure is already less than `to`, then no operation is performed.
*/
void Prune(size_t to);
void Prune(Context const* ctx, size_t to);
/* \brief Merge another set of sketch.
* \param that columns of other.
*/
void Merge(Span<OffsetT const> that_columns_ptr,
void Merge(Context const* ctx, Span<OffsetT const> that_columns_ptr,
Span<SketchEntry const> that);
/* \brief Merge quantiles from other GPU workers. */
@@ -175,7 +175,7 @@ class SketchContainer {
/* \brief Removes all the duplicated elements in quantile structure. */
template <typename KeyComp = thrust::equal_to<size_t>>
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
size_t Unique(Context const* ctx, KeyComp key_comp = thrust::equal_to<size_t>{}) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_.ordinal));
this->columns_ptr_.SetDevice(device_);
@@ -185,14 +185,12 @@ class SketchContainer {
HostDeviceVector<OffsetT> scan_out(d_column_scan.size());
scan_out.SetDevice(device_);
auto d_scan_out = scan_out.DeviceSpan();
dh::XGBCachingDeviceAllocator<char> alloc;
d_column_scan = this->columns_ptr_.DeviceSpan();
size_t n_uniques = dh::SegmentedUnique(
thrust::cuda::par(alloc), d_column_scan.data(),
d_column_scan.data() + d_column_scan.size(), entries.data(),
entries.data() + entries.size(), scan_out.DevicePointer(),
entries.data(), detail::SketchUnique{}, key_comp);
ctx->CUDACtx()->CTP(), d_column_scan.data(), d_column_scan.data() + d_column_scan.size(),
entries.data(), entries.data() + entries.size(), scan_out.DevicePointer(), entries.data(),
detail::SketchUnique{}, key_comp);
this->columns_ptr_.Copy(scan_out);
CHECK(!this->columns_ptr_.HostCanRead());

View File

@@ -11,6 +11,7 @@
#include <type_traits> // for invoke_result_t, declval
#include <vector> // for vector
#include "../common/cuda_rt_utils.h" // for xgboost_NVTX_FN_RANGE
#include "adapter.h"
#include "xgboost/c_api.h"
#include "xgboost/context.h"
@@ -36,6 +37,8 @@ class DataIterProxy {
DataIterProxy& operator=(DataIterProxy const& that) = default;
[[nodiscard]] bool Next() {
xgboost_NVTX_FN_RANGE();
bool ret = !!next_(iter_);
if (!ret) {
return ret;

View File

@@ -30,14 +30,13 @@ void MakeSketches(Context const* ctx,
ExternalDataInfo* p_ext_info) {
xgboost_NVTX_FN_RANGE();
CUDAContext const* cuctx = ctx->CUDACtx();
std::unique_ptr<common::SketchContainer> sketch;
auto& ext_info = *p_ext_info;
do {
// We use do while here as the first batch is fetched in ctor
CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
dh::safe_cuda(cudaSetDevice(dh::GetDevice(ctx).ordinal));
common::SetDevice(dh::GetDevice(ctx).ordinal);
if (ext_info.n_features == 0) {
ext_info.n_features = data::BatchColumns(proxy);
auto rc = collective::Allreduce(ctx, linalg::MakeVec(&ext_info.n_features, 1),
@@ -55,7 +54,16 @@ void MakeSketches(Context const* ctx,
}
proxy->Info().weights_.SetDevice(dh::GetDevice(ctx));
cuda_impl::Dispatch(proxy, [&](auto const& value) {
common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, sketch.get());
// Workaround empty input with CPU ctx.
Context new_ctx;
Context const* p_ctx;
if (ctx->IsCUDA()) {
p_ctx = ctx;
} else {
new_ctx.UpdateAllowUnknown(Args{{"device", dh::GetDevice(ctx).Name()}});
p_ctx = &new_ctx;
}
common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing, sketch.get());
});
}
auto batch_rows = data::BatchSamples(proxy);
@@ -66,7 +74,7 @@ void MakeSketches(Context const* ctx,
std::max(ext_info.row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, dh::GetDevice(ctx), missing);
}));
ext_info.nnz += thrust::reduce(cuctx->CTP(), row_counts.begin(), row_counts.end());
ext_info.nnz += thrust::reduce(ctx->CUDACtx()->CTP(), row_counts.begin(), row_counts.end());
ext_info.n_batches++;
ext_info.base_rows.push_back(batch_rows);
} while (iter->Next());
@@ -77,7 +85,7 @@ void MakeSketches(Context const* ctx,
ext_info.base_rows.begin());
// Get reference
dh::safe_cuda(cudaSetDevice(dh::GetDevice(ctx).ordinal));
common::SetDevice(dh::GetDevice(ctx).ordinal);
if (!ref) {
sketch->MakeCuts(ctx, cuts.get(), info.IsColumnSplit());
} else {

View File

@@ -11,6 +11,7 @@
#include "../common/device_helpers.cuh"
#include "../common/error_msg.h" // for InfInData
#include "../common/algorithm.cuh" // for CopyIf
#include "device_adapter.cuh" // for NoInfInData
namespace xgboost::data {
@@ -27,16 +28,15 @@ struct COOToEntryOp {
// Here the data is already correctly ordered and simply needs to be compacted
// to remove missing data
template <typename AdapterBatchT>
void CopyDataToDMatrix(AdapterBatchT batch, common::Span<Entry> data,
float missing) {
void CopyDataToDMatrix(AdapterBatchT batch, common::Span<Entry> data, float missing) {
auto counting = thrust::make_counting_iterator(0llu);
dh::XGBCachingDeviceAllocator<char> alloc;
COOToEntryOp<decltype(batch)> transform_op{batch};
thrust::transform_iterator<decltype(transform_op), decltype(counting)>
transform_iter(counting, transform_op);
thrust::transform_iterator<decltype(transform_op), decltype(counting)> transform_iter(
counting, transform_op);
auto begin_output = thrust::device_pointer_cast(data.data());
dh::CopyIf(transform_iter, transform_iter + batch.Size(), begin_output,
IsValidFunctor(missing));
auto ctx = Context{}.MakeCUDA(dh::CurrentDevice());
common::CopyIf(ctx.CUDACtx(), transform_iter, transform_iter + batch.Size(), begin_output,
IsValidFunctor(missing));
}
template <typename AdapterBatchT>