Handle duplicated values in sketching. (#6178)
* Accumulate weights in duplicated values. * Fix device id in iterative dmatrix.
This commit is contained in:
@@ -27,6 +27,7 @@
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <tuple>
|
||||
|
||||
#include "xgboost/logging.h"
|
||||
#include "xgboost/host_device_vector.h"
|
||||
@@ -1059,14 +1060,14 @@ struct SegmentedUniqueReduceOp {
|
||||
*
|
||||
* \return Number of unique values in total.
|
||||
*/
|
||||
template <typename KeyInIt, typename KeyOutIt, typename ValInIt,
|
||||
template <typename DerivedPolicy, typename KeyInIt, typename KeyOutIt, typename ValInIt,
|
||||
typename ValOutIt, typename Comp>
|
||||
size_t
|
||||
SegmentedUnique(KeyInIt key_segments_first, KeyInIt key_segments_last, ValInIt val_first,
|
||||
SegmentedUnique(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
|
||||
KeyInIt key_segments_first, KeyInIt key_segments_last, ValInIt val_first,
|
||||
ValInIt val_last, KeyOutIt key_segments_out, ValOutIt val_out,
|
||||
Comp comp) {
|
||||
using Key = thrust::pair<size_t, typename thrust::iterator_traits<ValInIt>::value_type>;
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
auto unique_key_it = dh::MakeTransformIterator<Key>(
|
||||
thrust::make_counting_iterator(static_cast<size_t>(0)),
|
||||
[=] __device__(size_t i) {
|
||||
@@ -1083,7 +1084,7 @@ SegmentedUnique(KeyInIt key_segments_first, KeyInIt key_segments_last, ValInIt v
|
||||
thrust::make_discard_iterator(),
|
||||
detail::SegmentedUniqueReduceOp<Key, KeyOutIt>{key_segments_out});
|
||||
auto uniques_ret = thrust::unique_by_key_copy(
|
||||
thrust::cuda::par(alloc), unique_key_it, unique_key_it + n_inputs,
|
||||
exec, unique_key_it, unique_key_it + n_inputs,
|
||||
val_first, reduce_it, val_out,
|
||||
[=] __device__(Key const &l, Key const &r) {
|
||||
if (l.first == r.first) {
|
||||
@@ -1094,8 +1095,16 @@ SegmentedUnique(KeyInIt key_segments_first, KeyInIt key_segments_last, ValInIt v
|
||||
});
|
||||
auto n_uniques = uniques_ret.second - val_out;
|
||||
CHECK_LE(n_uniques, n_inputs);
|
||||
thrust::exclusive_scan(thrust::cuda::par(alloc), key_segments_out,
|
||||
thrust::exclusive_scan(exec, key_segments_out,
|
||||
key_segments_out + segments_len, key_segments_out, 0);
|
||||
return n_uniques;
|
||||
}
|
||||
|
||||
template <typename... Inputs,
|
||||
std::enable_if_t<std::tuple_size<std::tuple<Inputs...>>::value == 7>
|
||||
* = nullptr>
|
||||
size_t SegmentedUnique(Inputs &&...inputs) {
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
return SegmentedUnique(thrust::cuda::par(alloc), std::forward<Inputs&&>(inputs)...);
|
||||
}
|
||||
} // namespace dh
|
||||
|
||||
@@ -269,7 +269,7 @@ void ProcessWeightedBatch(int device, const SparsePage& page,
|
||||
&cuts_ptr, &column_sizes_scan);
|
||||
|
||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||
auto d_cuts_ptr = cuts_ptr.ConstDeviceSpan();
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
|
||||
// Extract cuts
|
||||
sketch_container->Push(dh::ToSpan(sorted_entries),
|
||||
|
||||
@@ -153,7 +153,7 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
|
||||
sorted_entries.end(), detail::EntryCompareOp());
|
||||
|
||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||
auto d_cuts_ptr = cuts_ptr.ConstDeviceSpan();
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
// Extract the cuts from all columns concurrently
|
||||
sketch_container->Push(dh::ToSpan(sorted_entries),
|
||||
dh::ToSpan(column_sizes_scan), d_cuts_ptr,
|
||||
@@ -224,7 +224,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
|
||||
detail::SortByWeight(&temp_weights, &sorted_entries);
|
||||
|
||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||
auto d_cuts_ptr = cuts_ptr.ConstDeviceSpan();
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
|
||||
// Extract cuts
|
||||
sketch_container->Push(dh::ToSpan(sorted_entries),
|
||||
|
||||
@@ -104,9 +104,10 @@ void PruneImpl(int device,
|
||||
});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void CopyTo(Span<T> out, Span<T const> src) {
|
||||
template <typename T, typename U>
|
||||
void CopyTo(Span<T> out, Span<U> src) {
|
||||
CHECK_EQ(out.size(), src.size());
|
||||
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value, "");
|
||||
dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
|
||||
out.size_bytes(),
|
||||
cudaMemcpyDefault));
|
||||
@@ -307,7 +308,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
|
||||
}
|
||||
|
||||
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
|
||||
common::Span<OffsetT const> cuts_ptr,
|
||||
common::Span<OffsetT> cuts_ptr,
|
||||
size_t total_cuts, Span<float> weights) {
|
||||
Span<SketchEntry> out;
|
||||
dh::device_vector<SketchEntry> cuts;
|
||||
@@ -346,12 +347,15 @@ void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
|
||||
PruneImpl<Entry>(device_, cuts_ptr, entries, columns_ptr, ft, out,
|
||||
to_sketch_entry);
|
||||
}
|
||||
auto n_uniques = this->ScanInput(out, cuts_ptr);
|
||||
|
||||
if (!first_window) {
|
||||
CHECK_EQ(this->columns_ptr_.Size(), cuts_ptr.size());
|
||||
out = out.subspan(0, n_uniques);
|
||||
this->Merge(cuts_ptr, out);
|
||||
this->FixError();
|
||||
} else {
|
||||
this->Current().resize(n_uniques);
|
||||
this->columns_ptr_.SetDevice(device_);
|
||||
this->columns_ptr_.Resize(cuts_ptr.size());
|
||||
|
||||
@@ -360,6 +364,49 @@ void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
|
||||
}
|
||||
}
|
||||
|
||||
/* \brief Accumulate weights of duplicated feature values, then remove duplicated
 *        sketch entries from the input.
 *
 * \param entries          Sketch entries to process.  Modified in place: the scan and
 *                         the segmented unique both write back into this span.
 * \param d_columns_ptr_in Column offsets into `entries`; must hold num_columns_ + 1
 *                         values.  Overwritten with the offsets produced by
 *                         dh::SegmentedUnique.
 *
 * \return Number of entries left after dh::SegmentedUnique.
 */
size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_columns_ptr_in) {
  /* There are 2 types of duplication.  First is duplicated feature values, which come
   * from user input data.  Second is duplicated sketching entries, which are generated
   * by pruning or merging.  We preserve the first type and remove the second type.
   */
  // Balances the timer_.Stop(__func__) at the end of this function.
  timer_.Start(__func__);
  dh::safe_cuda(cudaSetDevice(device_));
  CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
  dh::XGBCachingDeviceAllocator<char> alloc;

  // Segment (column) id for each entry, enumerated from the last entry to the first.
  auto key_it = dh::MakeTransformIterator<size_t>(
      thrust::make_reverse_iterator(thrust::make_counting_iterator(entries.size())),
      [=] __device__(size_t idx) {
        return dh::SegmentId(d_columns_ptr_in, idx);
      });
  // Reverse scan to accumulate weights into first duplicated element on left.
  auto val_it = thrust::make_reverse_iterator(dh::tend(entries));
  thrust::inclusive_scan_by_key(
      thrust::cuda::par(alloc), key_it, key_it + entries.size(),
      val_it, val_it,
      thrust::equal_to<size_t>{},
      [] __device__(SketchEntry const &r, SketchEntry const &l) {
        // Only accumulate for the first type of duplication.
        if (l.value - r.value == 0 && l.rmin - r.rmin != 0) {
          // Same value but different rmin: rebuild `l` carrying the summed weight.
          auto w = l.wmin + r.wmin;
          SketchEntry v{l.rmin, l.rmin + w, w, l.value};
          return v;
        }
        return l;
      });

  auto d_columns_ptr_out = columns_ptr_b_.DeviceSpan();
  // thrust unique_by_key preserves the first element.
  auto n_uniques = dh::SegmentedUnique(
      d_columns_ptr_in.data(),
      d_columns_ptr_in.data() + d_columns_ptr_in.size(), entries.data(),
      entries.data() + entries.size(), d_columns_ptr_out.data(), entries.data(),
      detail::SketchUnique{});
  // Hand the updated column offsets back to the caller through the input span.
  CopyTo(d_columns_ptr_in, d_columns_ptr_out);

  timer_.Stop(__func__);
  return n_uniques;
}
|
||||
|
||||
size_t SketchContainer::Unique() {
|
||||
timer_.Start(__func__);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
@@ -389,7 +436,6 @@ void SketchContainer::Prune(size_t to) {
|
||||
timer_.Start(__func__);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
|
||||
this->Unique();
|
||||
OffsetT to_total = 0;
|
||||
auto& h_columns_ptr = columns_ptr_b_.HostVector();
|
||||
h_columns_ptr[0] = to_total;
|
||||
@@ -417,6 +463,8 @@ void SketchContainer::Prune(size_t to) {
|
||||
out, no_op);
|
||||
this->columns_ptr_.Copy(columns_ptr_b_);
|
||||
this->Alternate();
|
||||
|
||||
this->Unique();
|
||||
timer_.Stop(__func__);
|
||||
}
|
||||
|
||||
@@ -447,6 +495,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
|
||||
this->columns_ptr_.Copy(columns_ptr_b_);
|
||||
CHECK_EQ(this->columns_ptr_.Size(), num_columns_ + 1);
|
||||
this->Alternate();
|
||||
|
||||
timer_.Stop(__func__);
|
||||
}
|
||||
|
||||
@@ -558,7 +607,6 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
|
||||
|
||||
// Prune to final number of bins.
|
||||
this->Prune(num_bins_ + 1);
|
||||
this->Unique();
|
||||
this->FixError();
|
||||
|
||||
// Set up inputs
|
||||
|
||||
@@ -106,6 +106,8 @@ class SketchContainer {
|
||||
}
|
||||
/* \brief Return GPU ID for this container. */
|
||||
int32_t DeviceIdx() const { return device_; }
|
||||
/* \brief Accumulate weights of duplicated entries in input. */
|
||||
size_t ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_columns_ptr_in);
|
||||
/* \brief Removes all the duplicated elements in quantile structure. */
|
||||
size_t Unique();
|
||||
/* Fix rounding error and re-establish invariance. The error is mostly generated by the
|
||||
@@ -121,7 +123,7 @@ class SketchContainer {
|
||||
* \param weights (optional) data weights.
|
||||
*/
|
||||
void Push(Span<Entry const> entries, Span<size_t> columns_ptr,
|
||||
common::Span<OffsetT const> cuts_ptr, size_t total_cuts,
|
||||
common::Span<OffsetT> cuts_ptr, size_t total_cuts,
|
||||
Span<float> weights = {});
|
||||
/* \brief Prune the quantile structure.
|
||||
*
|
||||
|
||||
@@ -63,15 +63,17 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
|
||||
size_t accumulated_rows = 0;
|
||||
bst_feature_t cols = 0;
|
||||
int32_t device = GenericParameter::kCpuId;
|
||||
int32_t current_device_;
|
||||
dh::safe_cuda(cudaGetDevice(&current_device_));
|
||||
int32_t current_device;
|
||||
dh::safe_cuda(cudaGetDevice(&current_device));
|
||||
auto get_device = [&]() -> int32_t {
|
||||
int32_t d = GenericParameter::kCpuId ? current_device_ : device;
|
||||
int32_t d = (device == GenericParameter::kCpuId) ? current_device : device;
|
||||
CHECK_NE(d, GenericParameter::kCpuId);
|
||||
return d;
|
||||
};
|
||||
|
||||
while (iter.Next()) {
|
||||
device = proxy->DeviceIdx();
|
||||
CHECK_LT(device, common::AllVisibleGPUs());
|
||||
dh::safe_cuda(cudaSetDevice(get_device()));
|
||||
if (cols == 0) {
|
||||
cols = num_cols();
|
||||
|
||||
@@ -66,6 +66,9 @@ class DMatrixProxy : public DMatrix {
|
||||
} else {
|
||||
this->FromCudaArray(interface_str);
|
||||
}
|
||||
if (this->info_.num_row_ == 0) {
|
||||
this->device_ = GenericParameter::kCpuId;
|
||||
}
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user