Loop over thrust::reduce. (#6229)
* Check input chunk size of dqdm. * Add doc for current limitation.
This commit is contained in:
parent
734a911a26
commit
bed7ae4083
@ -167,7 +167,7 @@ or in R:
|
|||||||
|
|
||||||
Will print out something similiar to (not actual output as it's too long for demonstration):
|
Will print out something similiar to (not actual output as it's too long for demonstration):
|
||||||
|
|
||||||
.. code-block:: json
|
.. code-block:: javascript
|
||||||
|
|
||||||
{
|
{
|
||||||
"Learner": {
|
"Learner": {
|
||||||
|
|||||||
@ -871,6 +871,8 @@ class DeviceQuantileDMatrix(DMatrix):
|
|||||||
|
|
||||||
.. versionadded:: 1.1.0
|
.. versionadded:: 1.1.0
|
||||||
|
|
||||||
|
Known limitation:
|
||||||
|
The data size (rows * cols) can not exceed 2 ** 31 - 1000
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, data, label=None, weight=None, # pylint: disable=W0231
|
def __init__(self, data, label=None, weight=None, # pylint: disable=W0231
|
||||||
|
|||||||
@ -509,6 +509,10 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
|
|||||||
max_bin: Number of bins for histogram construction.
|
max_bin: Number of bins for histogram construction.
|
||||||
|
|
||||||
|
|
||||||
|
Know issue:
|
||||||
|
The size of each chunk (rows * cols for a single dask chunk/partition) can
|
||||||
|
not exceed 2 ** 31 - 1000
|
||||||
|
|
||||||
'''
|
'''
|
||||||
def __init__(self, client,
|
def __init__(self, client,
|
||||||
data,
|
data,
|
||||||
|
|||||||
@ -1132,4 +1132,21 @@ size_t SegmentedUnique(Inputs &&...inputs) {
|
|||||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||||
return SegmentedUnique(thrust::cuda::par(alloc), std::forward<Inputs&&>(inputs)...);
|
return SegmentedUnique(thrust::cuda::par(alloc), std::forward<Inputs&&>(inputs)...);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename Policy, typename InputIt, typename Init, typename Func>
|
||||||
|
auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) {
|
||||||
|
size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2;
|
||||||
|
size_t size = std::distance(first, second);
|
||||||
|
using Ty = std::remove_cv_t<Init>;
|
||||||
|
Ty aggregate = init;
|
||||||
|
for (size_t offset = 0; offset < size; offset += kLimit) {
|
||||||
|
auto begin_it = first + offset;
|
||||||
|
auto end_it = first + std::min(offset + kLimit, size);
|
||||||
|
size_t batch_size = std::distance(begin_it, end_it);
|
||||||
|
CHECK_LE(batch_size, size);
|
||||||
|
auto ret = thrust::reduce(policy, begin_it, end_it, init, reduce_op);
|
||||||
|
aggregate = reduce_op(aggregate, ret);
|
||||||
|
}
|
||||||
|
return aggregate;
|
||||||
|
}
|
||||||
} // namespace dh
|
} // namespace dh
|
||||||
|
|||||||
@ -221,7 +221,7 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||||
size_t row_stride = thrust::reduce(
|
size_t row_stride = dh::Reduce(
|
||||||
thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
|
thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
|
||||||
thrust::device_pointer_cast(offset.data()) + offset.size(), size_t(0),
|
thrust::device_pointer_cast(offset.data()) + offset.size(), size_t(0),
|
||||||
thrust::maximum<size_t>());
|
thrust::maximum<size_t>());
|
||||||
|
|||||||
@ -206,6 +206,14 @@ void CopyDataToEllpack(const AdapterBatchT& batch, EllpackPageImpl* dst,
|
|||||||
WriteCompressedEllpackFunctor<AdapterBatchT>, decltype(discard)>
|
WriteCompressedEllpackFunctor<AdapterBatchT>, decltype(discard)>
|
||||||
out(discard, functor);
|
out(discard, functor);
|
||||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||||
|
// 1000 as a safe factor for inclusive_scan, otherwise it might generate overflow and
|
||||||
|
// lead to oom error.
|
||||||
|
// or:
|
||||||
|
// after reduction step 2: cudaErrorInvalidConfiguration: invalid configuration argument
|
||||||
|
// https://github.com/NVIDIA/thrust/issues/1299
|
||||||
|
CHECK_LE(batch.Size(), std::numeric_limits<int32_t>::max() - 1000)
|
||||||
|
<< "Known limitation, size (rows * cols) of quantile based DMatrix "
|
||||||
|
"cannot exceed the limit of 32-bit integer.";
|
||||||
thrust::inclusive_scan(thrust::cuda::par(alloc), key_value_index_iter,
|
thrust::inclusive_scan(thrust::cuda::par(alloc), key_value_index_iter,
|
||||||
key_value_index_iter + batch.Size(), out,
|
key_value_index_iter + batch.Size(), out,
|
||||||
[=] __device__(Tuple a, Tuple b) {
|
[=] __device__(Tuple a, Tuple b) {
|
||||||
|
|||||||
@ -53,7 +53,7 @@ struct Pair {
|
|||||||
GradientPair first;
|
GradientPair first;
|
||||||
GradientPair second;
|
GradientPair second;
|
||||||
};
|
};
|
||||||
XGBOOST_DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
|
__host__ XGBOOST_DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
|
||||||
return {lhs.first + rhs.first, lhs.second + rhs.second};
|
return {lhs.first + rhs.first, lhs.second + rhs.second};
|
||||||
}
|
}
|
||||||
} // anonymous namespace
|
} // anonymous namespace
|
||||||
@ -86,7 +86,7 @@ GradientSumT CreateRoundingFactor(common::Span<GradientPair const> gpair) {
|
|||||||
thrust::device_ptr<GradientPair const> gpair_end {gpair.data() + gpair.size()};
|
thrust::device_ptr<GradientPair const> gpair_end {gpair.data() + gpair.size()};
|
||||||
auto beg = thrust::make_transform_iterator(gpair_beg, Clip());
|
auto beg = thrust::make_transform_iterator(gpair_beg, Clip());
|
||||||
auto end = thrust::make_transform_iterator(gpair_end, Clip());
|
auto end = thrust::make_transform_iterator(gpair_end, Clip());
|
||||||
Pair p = thrust::reduce(thrust::cuda::par(alloc), beg, end, Pair{});
|
Pair p = dh::Reduce(thrust::cuda::par(alloc), beg, end, Pair{}, thrust::plus<Pair>{});
|
||||||
GradientPair positive_sum {p.first}, negative_sum {p.second};
|
GradientPair positive_sum {p.first}, negative_sum {p.second};
|
||||||
|
|
||||||
auto histogram_rounding = GradientSumT {
|
auto histogram_rounding = GradientSumT {
|
||||||
|
|||||||
@ -642,10 +642,11 @@ struct GPUHistMakerDevice {
|
|||||||
ExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
|
ExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
|
||||||
constexpr bst_node_t kRootNIdx = 0;
|
constexpr bst_node_t kRootNIdx = 0;
|
||||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||||
GradientPair root_sum = thrust::reduce(
|
GradientPair root_sum = dh::Reduce(
|
||||||
thrust::cuda::par(alloc),
|
thrust::cuda::par(alloc),
|
||||||
thrust::device_ptr<GradientPair const>(gpair.data()),
|
thrust::device_ptr<GradientPair const>(gpair.data()),
|
||||||
thrust::device_ptr<GradientPair const>(gpair.data() + gpair.size()));
|
thrust::device_ptr<GradientPair const>(gpair.data() + gpair.size()),
|
||||||
|
GradientPair{}, thrust::plus<GradientPair>{});
|
||||||
rabit::Allreduce<rabit::op::Sum, float>(reinterpret_cast<float*>(&root_sum),
|
rabit::Allreduce<rabit::op::Sum, float>(reinterpret_cast<float*>(&root_sum),
|
||||||
2);
|
2);
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
/*!
|
/*!
|
||||||
* Copyright 2017 XGBoost contributors
|
* Copyright 2017 XGBoost contributors
|
||||||
*/
|
*/
|
||||||
@ -122,6 +121,14 @@ void TestSegmentedUniqueRegression(std::vector<SketchEntry> values, size_t n_dup
|
|||||||
ASSERT_EQ(segments.at(1), d_segments_out[1] + n_duplicated);
|
ASSERT_EQ(segments.at(1), d_segments_out[1] + n_duplicated);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(DeviceHelpers, Reduce) {
|
||||||
|
size_t kSize = std::numeric_limits<uint32_t>::max();
|
||||||
|
auto it = thrust::make_counting_iterator(0ul);
|
||||||
|
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||||
|
auto batched = dh::Reduce(thrust::cuda::par(alloc), it, it + kSize, 0ul, thrust::maximum<size_t>{});
|
||||||
|
CHECK_EQ(batched, kSize - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
TEST(SegmentedUnique, Regression) {
|
TEST(SegmentedUnique, Regression) {
|
||||||
{
|
{
|
||||||
|
|||||||
@ -234,5 +234,4 @@ TEST(EllpackPage, Compact) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user