diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index f66707552..dae5a955a 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -129,6 +129,12 @@ inline size_t AvailableMemory(int device_idx) {
   return device_free;
 }
 
+inline int32_t CurrentDevice() {
+  int32_t device = 0;
+  safe_cuda(cudaGetDevice(&device));
+  return device;
+}
+
 inline size_t TotalMemory(int device_idx) {
   size_t device_free = 0;
   size_t device_total = 0;
@@ -384,6 +390,16 @@ template <typename T>
 using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
 #endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
 
+inline void ThrowOOMError(std::string const& err, size_t bytes) {
+  auto device = CurrentDevice();
+  auto rank = rabit::GetRank();
+  std::stringstream ss;
+  ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
+     << "- Free memory: " << AvailableMemory(device) << "\n"
+     << "- Requested memory: " << bytes << std::endl;
+  LOG(FATAL) << ss.str();
+}
+
 /**
  * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
  */
@@ -397,7 +413,13 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
     using other = XGBDefaultDeviceAllocatorImpl<U>;  // NOLINT
   };
   pointer allocate(size_t n) {  // NOLINT
-    pointer ptr = SuperT::allocate(n);
+    pointer ptr;
+    try {
+      ptr = SuperT::allocate(n);
+      dh::safe_cuda(cudaGetLastError());
+    } catch (const std::exception &e) {
+      ThrowOOMError(e.what(), n * sizeof(T));
+    }
     GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
     return ptr;
   }
@@ -432,8 +454,11 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
   }
   pointer allocate(size_t n) {  // NOLINT
     T* ptr;
-    GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&ptr),
-                                               n * sizeof(T));
+    auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&ptr),
+                                                           n * sizeof(T));
+    if (errc != cudaSuccess) {
+      ThrowOOMError("Caching allocator", n * sizeof(T));
+    }
     pointer thrust_ptr{ ptr };
     GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
     return thrust_ptr;
diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu
index f8b775c7a..43e75bb21 100644
--- a/src/data/simple_dmatrix.cu
+++ b/src/data/simple_dmatrix.cu
@@ -35,24 +35,38 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
                      thrust::device_pointer_cast(offset.data()));
 }
 
+template <typename AdapterBatchT>
+struct COOToEntryOp {
+  AdapterBatchT batch;
+  __device__ Entry operator()(size_t idx) {
+    const auto& e = batch.GetElement(idx);
+    return Entry(e.column_idx, e.value);
+  }
+};
+
 // Here the data is already correctly ordered and simply needs to be compacted
 // to remove missing data
 template <typename AdapterT>
 void CopyDataToDMatrix(AdapterT* adapter, common::Span<Entry> data,
-                       int device_idx, float missing,
-                       common::Span<size_t> row_ptr) {
-  auto& batch = adapter->Value();
-  auto transform_f = [=] __device__(size_t idx) {
-    const auto& e = batch.GetElement(idx);
-    return Entry(e.column_idx, e.value);
-  };  // NOLINT
+                       float missing) {
+  auto batch = adapter->Value();
   auto counting = thrust::make_counting_iterator(0llu);
-  thrust::transform_iterator<decltype(transform_f), decltype(counting), Entry>
-      transform_iter(counting, transform_f);
   dh::XGBCachingDeviceAllocator<char> alloc;
-  thrust::copy_if(
-      thrust::cuda::par(alloc), transform_iter, transform_iter + batch.Size(),
-      thrust::device_pointer_cast(data.data()), IsValidFunctor(missing));
+  COOToEntryOp<decltype(batch)> transform_op{batch};
+  thrust::transform_iterator<decltype(transform_op), decltype(counting)>
+      transform_iter(counting, transform_op);
+  // We loop over batches because thrust::copy_if cant deal with sizes > 2^31
+  // See thrust issue #1302
+  size_t max_copy_size = std::numeric_limits<int>::max() / 2;
+  auto begin_output = thrust::device_pointer_cast(data.data());
+  for (size_t offset = 0; offset < batch.Size(); offset += max_copy_size) {
+    auto begin_input = transform_iter + offset;
+    auto end_input =
+        transform_iter + std::min(offset + max_copy_size, batch.Size());
+    begin_output =
+        thrust::copy_if(thrust::cuda::par(alloc), begin_input, end_input,
+                        begin_output, IsValidFunctor(missing));
+  }
 }
 
 // Does not currently support metainfo as no on-device data source contains this
@@ -77,8 +91,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
   CountRowOffsets(batch, s_offset, adapter->DeviceIdx(), missing);
   info_.num_nonzero_ = sparse_page_.offset.HostVector().back();
   sparse_page_.data.Resize(info_.num_nonzero_);
-  CopyDataToDMatrix(adapter, sparse_page_.data.DeviceSpan(),
-                    adapter->DeviceIdx(), missing, s_offset);
+  CopyDataToDMatrix(adapter, sparse_page_.data.DeviceSpan(), missing);
 
   info_.num_col_ = adapter->NumColumns();
   info_.num_row_ = adapter->NumRows();
diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu
index 006c036d3..1652f5842 100644
--- a/tests/cpp/common/test_device_helpers.cu
+++ b/tests/cpp/common/test_device_helpers.cu
@@ -156,5 +156,13 @@ TEST(SegmentedUnique, Regression) {
     TestSegmentedUniqueRegression(values, 0);
   }
 }
+
+TEST(Allocator, OOM) {
+  auto size = dh::AvailableMemory(0) * 4;
+  ASSERT_THROW({dh::caching_device_vector<char> vec(size);}, dmlc::Error);
+  ASSERT_THROW({dh::device_vector<char> vec(size);}, dmlc::Error);
+  // Clear last error so we don't fail subsequent tests
+  cudaGetLastError();
+}
 }  // namespace common
-}  // namespace xgboost
\ No newline at end of file
+}  // namespace xgboost