Check cub errors. (#10721)
- Make sure the CUDA error returned by the cub scan is caught.
- Avoid a temporary buffer allocation in the thrust device vector.
commit 508ac13243 (parent b949a4bf7b)
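
Every cub entry point returns a `cudaError_t` that is silently discarded unless the caller checks it. Below is a minimal standalone sketch of the pattern this commit applies, assuming only the CUDA runtime and cub; `check_cuda` is a hypothetical stand-in for xgboost's `dh::safe_cuda`.

```cpp
// Sketch of checking cub's return codes; check_cuda is a hypothetical
// stand-in for dh::safe_cuda.
#include <cub/device/device_scan.cuh>
#include <cuda_runtime.h>

#include <cstddef>
#include <cstdio>
#include <cstdlib>

inline void check_cuda(cudaError_t status) {
  if (status != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    std::abort();
  }
}

int main() {
  int const n = 1024;
  int *d_in = nullptr, *d_out = nullptr;
  check_cuda(cudaMalloc(&d_in, n * sizeof(int)));
  check_cuda(cudaMalloc(&d_out, n * sizeof(int)));

  // cub's two-phase protocol: the first call only queries the required
  // scratch size, the second performs the scan.  Both return cudaError_t,
  // which is easy to drop on the floor.
  std::size_t temp_bytes = 0;
  check_cuda(cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_in, d_out, n));
  void *d_temp = nullptr;
  check_cuda(cudaMalloc(&d_temp, temp_bytes));
  check_cuda(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_in, d_out, n));

  check_cuda(cudaFree(d_temp));
  check_cuda(cudaFree(d_in));
  check_cuda(cudaFree(d_out));
  return 0;
}
```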
@@ -309,9 +309,9 @@ ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch)
 ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch)
 
 namespace {
-void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const> d_row_ptr,
-                        size_t row_stride, common::CompressedByteT* d_compressed_buffer,
-                        size_t null) {
+void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page,
+                        common::Span<size_t const> d_row_ptr, size_t row_stride,
+                        common::CompressedByteT* d_compressed_buffer, size_t null) {
   dh::device_vector<uint8_t> data(page.index.begin(), page.index.end());
   auto d_data = dh::ToSpan(data);
 
@@ -323,7 +323,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
   common::CompressedBufferWriter writer{page.cut.TotalBins() +
                                         static_cast<std::size_t>(1)};  // +1 for null value
 
-  dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
+  auto cuctx = ctx->CUDACtx();
+  dh::LaunchN(row_stride * page.Size(), cuctx->Stream(), [=] __device__(bst_idx_t idx) mutable {
     auto ridx = idx / row_stride;
     auto ifeature = idx % row_stride;
 
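
The hunk above also threads the CUDA context's stream into `dh::LaunchN` instead of relying on the legacy default stream. A hedged sketch of what that amounts to in raw CUDA; the `fill` kernel is illustrative, and `dh::LaunchN`'s stream overload is assumed to perform the equivalent launch:

```cpp
// Sketch of launching on an explicit stream rather than the legacy default
// stream; fill is an illustrative kernel, not part of the diff.
#include <cuda_runtime.h>

#include <cstddef>
#include <cstdint>

__global__ void fill(float *data, std::size_t n, float value) {
  std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) { data[idx] = value; }
}

void launch_on_stream(float *d_data, std::size_t n, cudaStream_t stream) {
  std::uint32_t const block = 256;
  std::uint32_t const grid = static_cast<std::uint32_t>((n + block - 1) / block);
  // Passing the stream keeps this kernel ordered with other work queued on
  // the same stream, without implicit synchronization against other streams.
  fill<<<grid, block, 0, stream>>>(d_data, n, 1.0f);
}
```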
@@ -336,7 +337,7 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
       return;
     }
 
-    size_t offset = 0;
+    bst_idx_t offset = 0;
     if (!d_csc_indptr.empty()) {
       // is dense, ifeature is the actual feature index.
       offset = d_csc_indptr[ifeature];
@@ -362,7 +363,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   row_stride = *std::max_element(it, it + page.Size());
 
   CHECK(ctx->IsCUDA());
-  InitCompressedData(ctx);
+  this->InitCompressedData(ctx);
 
   // copy gidx
   common::CompressedByteT* d_compressed_buffer = gidx_buffer.data();
@@ -373,7 +374,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
 
   auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
   auto null = accessor.NullValue();
-  CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Start("CopyGHistToEllpack");
+  CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Stop("CopyGHistToEllpack");
 }
 
 // A functor that copies the data from one EllpackPage to another.
@@ -472,7 +472,9 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_
 
 GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
                                                      EvaluateSplitSharedInputs shared_inputs) {
-  dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
+  dh::device_vector<EvaluateSplitInputs> inputs(1);
+  dh::safe_cuda(cudaMemcpyAsync(inputs.data().get(), &input, sizeof(input), cudaMemcpyDefault));
+
   dh::TemporaryArray<GPUExpandEntry> out_entries(1);
   this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
                        shared_inputs, dh::ToSpan(out_entries));
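
The hunk above is the second commit-message bullet in action: assigning a host `std::vector` to a device vector goes through thrust's generic copy machinery, which may stage the data in a temporary buffer, whereas sizing the vector up front and issuing one `cudaMemcpyAsync` copies the single element directly. A hedged sketch of the two patterns with plain thrust types; `SplitInput` is a hypothetical placeholder for `EvaluateSplitInputs`:

```cpp
// Sketch contrasting the two copy patterns; SplitInput is a hypothetical
// placeholder for EvaluateSplitInputs.
#include <thrust/device_vector.h>
#include <cuda_runtime.h>

#include <cstdlib>
#include <vector>

struct SplitInput {
  int nidx;
  float gain;
};

void copy_single_input(SplitInput const &input) {
  // Before: the host std::vector goes through thrust's generic copy
  // machinery, which can allocate a temporary staging buffer.
  std::vector<SplitInput> host{input};
  thrust::device_vector<SplitInput> before = host;

  // After: allocate once, then copy the one element with a single memcpy.
  thrust::device_vector<SplitInput> after(1);
  if (cudaMemcpyAsync(after.data().get(), &input, sizeof(input),
                      cudaMemcpyDefault) != cudaSuccess) {
    std::abort();  // mirrors the dh::safe_cuda wrapper in the diff
  }
  cudaDeviceSynchronize();  // &input must stay alive until the copy completes
}
```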
@@ -325,7 +325,7 @@ class DeviceHistogramBuilderImpl {
   void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
                       FeatureGroupsAccessor const& feature_groups,
                       common::Span<GradientPair const> gpair,
-                      common::Span<const std::uint32_t> d_ridx,
+                      common::Span<const cuda_impl::RowIndexT> d_ridx,
                       common::Span<GradientPairInt64> histogram, GradientQuantiser rounding) {
     CHECK(kernel_);
     // Otherwise launch blocks such that each block has a minimum amount of work to do
@@ -369,7 +369,7 @@ void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx,
                                             EllpackDeviceAccessor const& matrix,
                                             FeatureGroupsAccessor const& feature_groups,
                                             common::Span<GradientPair const> gpair,
-                                            common::Span<const std::uint32_t> ridx,
+                                            common::Span<const cuda_impl::RowIndexT> ridx,
                                             common::Span<GradientPairInt64> histogram,
                                             GradientQuantiser rounding) {
   this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding);
@@ -132,7 +132,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                        common::Span<cuda_impl::RowIndexT> ridx,
                        common::Span<cuda_impl::RowIndexT> ridx_tmp,
                        common::Span<cuda_impl::RowIndexT> d_counts, bst_idx_t total_rows, OpT op,
-                       dh::device_vector<int8_t>* tmp) {
+                       dh::DeviceUVector<int8_t>* tmp) {
   dh::LDGIterator<PerNodeData<OpDataT>> batch_info_itr(d_batch_info.data());
   WriteResultsFunctor<OpDataT> write_results{batch_info_itr, ridx.data(), ridx_tmp.data(),
                                              d_counts.data()};
@@ -150,14 +150,16 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                                        go_left};
   });
   std::size_t temp_bytes = 0;
+  // Restriction imposed by cub.
+  CHECK_LE(total_rows, static_cast<bst_idx_t>(std::numeric_limits<std::int32_t>::max()));
   if (tmp->empty()) {
-    cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
-                                   IndexFlagOp{}, total_rows);
+    dh::safe_cuda(cub::DeviceScan::InclusiveScan(
+        nullptr, temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp{}, total_rows));
     tmp->resize(temp_bytes);
   }
   temp_bytes = tmp->size();
-  cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator,
-                                 discard_write_iterator, IndexFlagOp{}, total_rows);
+  dh::safe_cuda(cub::DeviceScan::InclusiveScan(tmp->data(), temp_bytes, input_iterator,
+                                               discard_write_iterator, IndexFlagOp{}, total_rows));
 
   constexpr int kBlockSize = 256;
 
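
Two details in the hunk above are worth spelling out. The size-query call runs only when `tmp` is empty, so the scratch buffer is allocated once and reused across calls; cub accepts any buffer at least as large as requested, which is why `temp_bytes = tmp->size()` is valid on reuse. The new `CHECK_LE` documents that the cub versions targeted here take `num_items` as a signed 32-bit `int`, so a 64-bit row count must be range-checked before it narrows. A hedged sketch of the caching pattern, using a plain inclusive sum rather than xgboost's custom `IndexFlagOp` scan:

```cpp
// Sketch of caching cub's scratch buffer across calls; uses a plain
// inclusive sum instead of xgboost's IndexFlagOp scan.
#include <cub/device/device_scan.cuh>
#include <thrust/device_vector.h>
#include <cuda_runtime.h>

#include <cassert>
#include <cstdint>
#include <limits>

inline void check_cuda(cudaError_t status) { assert(status == cudaSuccess); }

void scan_with_cached_scratch(int const *d_in, int *d_out, std::int64_t n,
                              thrust::device_vector<std::int8_t> *tmp) {
  // cub takes num_items as a 32-bit int here, so guard the narrowing.
  assert(n <= std::numeric_limits<std::int32_t>::max());

  std::size_t temp_bytes = 0;
  if (tmp->empty()) {
    // First call: query the required scratch size, then allocate it once.
    check_cuda(cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_in, d_out,
                                             static_cast<int>(n)));
    tmp->resize(temp_bytes);
  }
  // Reuse: any buffer at least as large as the query result is acceptable.
  temp_bytes = tmp->size();
  check_cuda(cub::DeviceScan::InclusiveSum(thrust::raw_pointer_cast(tmp->data()),
                                           temp_bytes, d_in, d_out,
                                           static_cast<int>(n)));
}
```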
@@ -236,7 +238,7 @@ class RowPartitioner {
   dh::DeviceUVector<RowIndexT> ridx_;
   // Staging area for sorting ridx
   dh::DeviceUVector<RowIndexT> ridx_tmp_;
-  dh::device_vector<int8_t> tmp_;
+  dh::DeviceUVector<int8_t> tmp_;
   dh::PinnedMemory pinned_;
   dh::PinnedMemory pinned2_;
   bst_node_t n_nodes_{0};  // Counter for internal checks.
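
`tmp_` is pure scratch space for the cub scan, so its contents never need initializing. `dh::device_vector` is thrust-backed and value-initializes its elements (an extra fill kernel on construction or resize), while `dh::DeviceUVector` is assumed here, as the name suggests, to leave storage uninitialized. A sketch of the difference using raw thrust primitives under that assumption:

```cpp
// Sketch of initialized vs. uninitialized device scratch allocation; the
// DeviceUVector member is assumed to behave like the "uninitialized" case.
#include <thrust/device_vector.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>

#include <cstddef>
#include <cstdint>

void allocate_scratch(std::size_t temp_bytes) {
  // thrust::device_vector value-initializes its elements: constructing it
  // launches a fill kernel zeroing bytes that cub will overwrite anyway.
  thrust::device_vector<std::int8_t> initialized(temp_bytes);

  // device_malloc only reserves memory; no fill kernel is launched.
  thrust::device_ptr<std::int8_t> uninitialized =
      thrust::device_malloc<std::int8_t>(temp_bytes);
  thrust::device_free(uninitialized);
}
```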
@@ -49,9 +49,9 @@ void TestUpdatePositionBatch() {
 TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); }
 
 void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Segment>& segments) {
-  thrust::device_vector<uint32_t> ridx = ridx_in;
-  thrust::device_vector<uint32_t> ridx_tmp(ridx_in.size());
-  thrust::device_vector<bst_uint> counts(segments.size());
+  thrust::device_vector<cuda_impl::RowIndexT> ridx = ridx_in;
+  thrust::device_vector<cuda_impl::RowIndexT> ridx_tmp(ridx_in.size());
+  thrust::device_vector<cuda_impl::RowIndexT> counts(segments.size());
 
   auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; };
   std::vector<int> op_data(segments.size());
@@ -66,7 +66,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
   dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
                                 h_batch_info.size() * sizeof(PerNodeData<int>), cudaMemcpyDefault,
                                 nullptr));
-  dh::device_vector<int8_t> tmp;
+  dh::DeviceUVector<int8_t> tmp;
   SortPositionBatch<decltype(op), int>(dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
                                        dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op,
                                        &tmp);
@@ -91,5 +91,4 @@ TEST(GpuHist, SortPositionBatch) {
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}});
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}});
 }
-
 }  // namespace xgboost::tree