Check cub errors. (#10721)
- Make sure the CUDA error returned by the cub scan is caught.
- Avoid a temporary buffer allocation in the thrust device vector.
parent b949a4bf7b
commit 508ac13243
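Background for the first point: every cub device-wide algorithm returns a cudaError_t, and the usual two-phase call (a first call with a null buffer to query the temporary-storage size, then a second call with the allocated buffer) silently drops failures unless both return values are checked. The standalone sketch below illustrates the pattern this commit enforces; the CheckCuda macro is a hypothetical stand-in for XGBoost's dh::safe_cuda and is not part of the change.

#include <cub/cub.cuh>
#include <thrust/device_vector.h>

#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for dh::safe_cuda: abort on any error returned by cub.
#define CheckCuda(call)                                                    \
  do {                                                                     \
    cudaError_t err = (call);                                              \
    if (err != cudaSuccess) {                                              \
      std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));   \
      std::abort();                                                        \
    }                                                                      \
  } while (0)

int main() {
  thrust::device_vector<int> in(1024, 1);
  thrust::device_vector<int> out(in.size());

  // Phase 1: query the temporary storage size. The return value is easy to drop.
  std::size_t temp_bytes = 0;
  CheckCuda(cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, in.data().get(),
                                          out.data().get(), static_cast<int>(in.size())));

  // Allocate the scratch buffer once; it can be cached and reused across calls.
  thrust::device_vector<char> temp(temp_bytes);

  // Phase 2: run the scan with the allocated buffer, again checking the result.
  CheckCuda(cub::DeviceScan::InclusiveSum(temp.data().get(), temp_bytes, in.data().get(),
                                          out.data().get(), static_cast<int>(in.size())));
  CheckCuda(cudaDeviceSynchronize());
  return 0;
}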
@@ -309,9 +309,9 @@ ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch)
 ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch)
 
 namespace {
-void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const> d_row_ptr,
-                        size_t row_stride, common::CompressedByteT* d_compressed_buffer,
-                        size_t null) {
+void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page,
+                        common::Span<size_t const> d_row_ptr, size_t row_stride,
+                        common::CompressedByteT* d_compressed_buffer, size_t null) {
   dh::device_vector<uint8_t> data(page.index.begin(), page.index.end());
   auto d_data = dh::ToSpan(data);
 
@@ -323,7 +323,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
   common::CompressedBufferWriter writer{page.cut.TotalBins() +
                                         static_cast<std::size_t>(1)};  // +1 for null value
 
-  dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
+  auto cuctx = ctx->CUDACtx();
+  dh::LaunchN(row_stride * page.Size(), cuctx->Stream(), [=] __device__(bst_idx_t idx) mutable {
     auto ridx = idx / row_stride;
     auto ifeature = idx % row_stride;
 
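The hunk above also switches the launch from the default stream to the stream owned by the CUDA context. As a generic illustration only (none of these names come from XGBoost), the sketch below launches a simple kernel on an explicitly created stream and synchronizes just that stream:

#include <cuda_runtime.h>

#include <cstddef>

// Hypothetical kernel: each thread writes its global index into the output buffer.
__global__ void FillIndices(std::size_t n, std::size_t* out) {
  std::size_t idx = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < n) {
    out[idx] = idx;
  }
}

int main() {
  std::size_t const n = 1 << 20;
  std::size_t* out = nullptr;
  cudaMalloc(&out, n * sizeof(std::size_t));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // The fourth launch parameter selects the stream, so this work does not
  // serialize against unrelated kernels on the default stream.
  unsigned int const block = 256;
  unsigned int const grid = static_cast<unsigned int>((n + block - 1) / block);
  FillIndices<<<grid, block, 0, stream>>>(n, out);

  // Wait only for this stream before releasing resources.
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  cudaFree(out);
  return 0;
}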
@@ -336,7 +337,7 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
       return;
     }
 
-    size_t offset = 0;
+    bst_idx_t offset = 0;
     if (!d_csc_indptr.empty()) {
       // is dense, ifeature is the actual feature index.
       offset = d_csc_indptr[ifeature];
@@ -362,7 +363,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   row_stride = *std::max_element(it, it + page.Size());
 
   CHECK(ctx->IsCUDA());
-  InitCompressedData(ctx);
+  this->InitCompressedData(ctx);
 
   // copy gidx
   common::CompressedByteT* d_compressed_buffer = gidx_buffer.data();
@@ -373,7 +374,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
 
   auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
   auto null = accessor.NullValue();
-  CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Start("CopyGHistToEllpack");
+  CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Stop("CopyGHistToEllpack");
 }
 
 // A functor that copies the data from one EllpackPage to another.
@@ -472,7 +472,9 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_
 
 GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
                                                      EvaluateSplitSharedInputs shared_inputs) {
-  dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
+  dh::device_vector<EvaluateSplitInputs> inputs(1);
+  dh::safe_cuda(cudaMemcpyAsync(inputs.data().get(), &input, sizeof(input), cudaMemcpyDefault));
+
   dh::TemporaryArray<GPUExpandEntry> out_entries(1);
   this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
                        shared_inputs, dh::ToSpan(out_entries));
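The second point of the commit message shows up in the hunk above: constructing a thrust::device_vector from a braced host std::vector allocates a throwaway host container and performs a blocking host-to-device copy, whereas sizing the device vector up front and issuing one cudaMemcpyAsync avoids the temporary. A minimal sketch of the two variants, using a made-up Payload struct in place of EvaluateSplitInputs:

#include <thrust/device_vector.h>

#include <cuda_runtime.h>

#include <vector>

// Hypothetical trivially copyable payload standing in for EvaluateSplitInputs.
struct Payload {
  int nidx;
  float gain;
};

int main() {
  Payload input{0, 0.5f};

  // Before: a temporary host std::vector is allocated just to hold one element,
  // and the conversion copies it to the device before returning.
  thrust::device_vector<Payload> with_temporary = std::vector<Payload>{input};

  // After: size the device vector directly and copy the single element asynchronously.
  // The source (&input) must stay alive until the copy has completed.
  thrust::device_vector<Payload> without_temporary(1);
  cudaError_t err = cudaMemcpyAsync(without_temporary.data().get(), &input, sizeof(input),
                                    cudaMemcpyDefault);
  if (err != cudaSuccess) {
    return 1;
  }
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}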
@@ -325,7 +325,7 @@ class DeviceHistogramBuilderImpl {
   void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
                       FeatureGroupsAccessor const& feature_groups,
                       common::Span<GradientPair const> gpair,
-                      common::Span<const std::uint32_t> d_ridx,
+                      common::Span<const cuda_impl::RowIndexT> d_ridx,
                       common::Span<GradientPairInt64> histogram, GradientQuantiser rounding) {
     CHECK(kernel_);
     // Otherwise launch blocks such that each block has a minimum amount of work to do
@@ -369,7 +369,7 @@ void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx,
                                             EllpackDeviceAccessor const& matrix,
                                             FeatureGroupsAccessor const& feature_groups,
                                             common::Span<GradientPair const> gpair,
-                                            common::Span<const std::uint32_t> ridx,
+                                            common::Span<const cuda_impl::RowIndexT> ridx,
                                             common::Span<GradientPairInt64> histogram,
                                             GradientQuantiser rounding) {
   this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding);
@@ -132,7 +132,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                        common::Span<cuda_impl::RowIndexT> ridx,
                        common::Span<cuda_impl::RowIndexT> ridx_tmp,
                        common::Span<cuda_impl::RowIndexT> d_counts, bst_idx_t total_rows, OpT op,
-                       dh::device_vector<int8_t>* tmp) {
+                       dh::DeviceUVector<int8_t>* tmp) {
   dh::LDGIterator<PerNodeData<OpDataT>> batch_info_itr(d_batch_info.data());
   WriteResultsFunctor<OpDataT> write_results{batch_info_itr, ridx.data(), ridx_tmp.data(),
                                              d_counts.data()};
@@ -150,14 +150,16 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                               go_left};
       });
   std::size_t temp_bytes = 0;
+  // Restriction imposed by cub.
+  CHECK_LE(total_rows, static_cast<bst_idx_t>(std::numeric_limits<std::int32_t>::max()));
   if (tmp->empty()) {
-    cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
-                                   IndexFlagOp{}, total_rows);
+    dh::safe_cuda(cub::DeviceScan::InclusiveScan(
+        nullptr, temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp{}, total_rows));
     tmp->resize(temp_bytes);
   }
   temp_bytes = tmp->size();
-  cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator,
-                                 discard_write_iterator, IndexFlagOp{}, total_rows);
+  dh::safe_cuda(cub::DeviceScan::InclusiveScan(tmp->data(), temp_bytes, input_iterator,
+                                               discard_write_iterator, IndexFlagOp{}, total_rows));
 
   constexpr int kBlockSize = 256;
 
@@ -236,7 +238,7 @@ class RowPartitioner {
   dh::DeviceUVector<RowIndexT> ridx_;
   // Staging area for sorting ridx
   dh::DeviceUVector<RowIndexT> ridx_tmp_;
-  dh::device_vector<int8_t> tmp_;
+  dh::DeviceUVector<int8_t> tmp_;
   dh::PinnedMemory pinned_;
   dh::PinnedMemory pinned2_;
   bst_node_t n_nodes_{0};  // Counter for internal checks.
@@ -49,9 +49,9 @@ void TestUpdatePositionBatch() {
 TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); }
 
 void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Segment>& segments) {
-  thrust::device_vector<uint32_t> ridx = ridx_in;
-  thrust::device_vector<uint32_t> ridx_tmp(ridx_in.size());
-  thrust::device_vector<bst_uint> counts(segments.size());
+  thrust::device_vector<cuda_impl::RowIndexT> ridx = ridx_in;
+  thrust::device_vector<cuda_impl::RowIndexT> ridx_tmp(ridx_in.size());
+  thrust::device_vector<cuda_impl::RowIndexT> counts(segments.size());
 
   auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; };
   std::vector<int> op_data(segments.size());
@@ -66,7 +66,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
   dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
                                 h_batch_info.size() * sizeof(PerNodeData<int>), cudaMemcpyDefault,
                                 nullptr));
-  dh::device_vector<int8_t> tmp;
+  dh::DeviceUVector<int8_t> tmp;
   SortPositionBatch<decltype(op), int>(dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
                                        dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op,
                                        &tmp);
@@ -91,5 +91,4 @@ TEST(GpuHist, SortPositionBatch) {
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}});
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}});
 }
-
 }  // namespace xgboost::tree