Loop over copy_if (#6201)

* Loop over copy_if
* Catch OOM.

Co-authored-by: fis <jm.yuan@outlook.com>

parent 0fc263ead5
commit 734a911a26
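
In short, the change has two parts. First, the device data-copy path now runs thrust::copy_if in chunks of at most std::numeric_limits<int>::max() / 2 elements, because copy_if cannot handle input sizes above 2^31 (thrust issue #1302). Second, both device allocators now catch allocation failures and report the worker rank, the free device memory, and the requested size through a new ThrowOOMError helper; a regression test, TEST(Allocator, OOM), exercises both allocators.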

@@ -129,6 +129,12 @@ inline size_t AvailableMemory(int device_idx) {
   return device_free;
 }
 
+inline int32_t CurrentDevice() {
+  int32_t device = 0;
+  safe_cuda(cudaGetDevice(&device));
+  return device;
+}
+
 inline size_t TotalMemory(int device_idx) {
   size_t device_free = 0;
   size_t device_total = 0;

@@ -384,6 +390,16 @@ template <typename T>
 using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
 #endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
 
+inline void ThrowOOMError(std::string const& err, size_t bytes) {
+  auto device = CurrentDevice();
+  auto rank = rabit::GetRank();
+  std::stringstream ss;
+  ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
+     << "- Free memory: " << AvailableMemory(device) << "\n"
+     << "- Requested memory: " << bytes << std::endl;
+  LOG(FATAL) << ss.str();
+}
+
 /**
  * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
  */

@@ -397,7 +413,13 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
     using other = XGBDefaultDeviceAllocatorImpl<U>;  // NOLINT
   };
   pointer allocate(size_t n) {  // NOLINT
-    pointer ptr = SuperT::allocate(n);
+    pointer ptr;
+    try {
+      ptr = SuperT::allocate(n);
+      dh::safe_cuda(cudaGetLastError());
+    } catch (const std::exception &e) {
+      ThrowOOMError(e.what(), n * sizeof(T));
+    }
     GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
     return ptr;
   }

@@ -432,8 +454,11 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
   }
   pointer allocate(size_t n) {  // NOLINT
     T* ptr;
-    GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&ptr),
-                                               n * sizeof(T));
+    auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&ptr),
+                                                           n * sizeof(T));
+    if (errc != cudaSuccess) {
+      ThrowOOMError("Caching allocator", n * sizeof(T));
+    }
     pointer thrust_ptr{ ptr };
     GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
     return thrust_ptr;
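
The two allocator hunks above apply the same pattern: attempt the device allocation, and on failure surface a message pairing the requested size with the memory actually available. Below is a minimal standalone sketch of that pattern, assuming only Thrust and the CUDA runtime; the XGBoost-specific helpers (ThrowOOMError, GlobalMemoryLogger, GetGlobalCachingAllocator) are swapped for plain equivalents, so treat it as an illustration rather than the library's code.

#include <thrust/device_malloc_allocator.h>
#include <cuda_runtime.h>
#include <cstddef>
#include <sstream>
#include <stdexcept>

template <typename T>
struct ReportingAllocator : thrust::device_malloc_allocator<T> {
  using SuperT = thrust::device_malloc_allocator<T>;
  using pointer = typename SuperT::pointer;
  pointer allocate(size_t n) {
    try {
      // Thrust throws (ultimately from a failed cudaMalloc) on OOM.
      return SuperT::allocate(n);
    } catch (const std::exception& e) {
      size_t free_bytes = 0;
      size_t total_bytes = 0;
      // cudaMemGetInfo is the runtime call a helper like AvailableMemory wraps.
      cudaMemGetInfo(&free_bytes, &total_bytes);
      std::stringstream ss;
      ss << "Memory allocation error: " << e.what() << "\n"
         << "- Free memory: " << free_bytes << "\n"
         << "- Requested memory: " << n * sizeof(T);
      throw std::runtime_error(ss.str());
    }
  }
};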

@@ -35,24 +35,38 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
       thrust::device_pointer_cast(offset.data()));
 }
 
+template <typename AdapterBatchT>
+struct COOToEntryOp {
+  AdapterBatchT batch;
+  __device__ Entry operator()(size_t idx) {
+    const auto& e = batch.GetElement(idx);
+    return Entry(e.column_idx, e.value);
+  }
+};
+
 // Here the data is already correctly ordered and simply needs to be compacted
 // to remove missing data
 template <typename AdapterT>
 void CopyDataToDMatrix(AdapterT* adapter, common::Span<Entry> data,
-                       int device_idx, float missing,
-                       common::Span<size_t> row_ptr) {
-  auto& batch = adapter->Value();
-  auto transform_f = [=] __device__(size_t idx) {
-    const auto& e = batch.GetElement(idx);
-    return Entry(e.column_idx, e.value);
-  };  // NOLINT
+                       float missing) {
+  auto batch = adapter->Value();
   auto counting = thrust::make_counting_iterator(0llu);
-  thrust::transform_iterator<decltype(transform_f), decltype(counting), Entry>
-      transform_iter(counting, transform_f);
   dh::XGBCachingDeviceAllocator<char> alloc;
-  thrust::copy_if(
-      thrust::cuda::par(alloc), transform_iter, transform_iter + batch.Size(),
-      thrust::device_pointer_cast(data.data()), IsValidFunctor(missing));
+  COOToEntryOp<decltype(batch)> transform_op{batch};
+  thrust::transform_iterator<decltype(transform_op), decltype(counting)>
+      transform_iter(counting, transform_op);
+  // We loop over batches because thrust::copy_if can't deal with sizes > 2^31
+  // See thrust issue #1302
+  size_t max_copy_size = std::numeric_limits<int>::max() / 2;
+  auto begin_output = thrust::device_pointer_cast(data.data());
+  for (size_t offset = 0; offset < batch.Size(); offset += max_copy_size) {
+    auto begin_input = transform_iter + offset;
+    auto end_input =
+        transform_iter + std::min(offset + max_copy_size, batch.Size());
+    begin_output =
+        thrust::copy_if(thrust::cuda::par(alloc), begin_input, end_input,
+                        begin_output, IsValidFunctor(missing));
+  }
 }
 
 // Does not currently support metainfo as no on-device data source contains this

@@ -77,8 +91,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
   CountRowOffsets(batch, s_offset, adapter->DeviceIdx(), missing);
   info_.num_nonzero_ = sparse_page_.offset.HostVector().back();
   sparse_page_.data.Resize(info_.num_nonzero_);
-  CopyDataToDMatrix(adapter, sparse_page_.data.DeviceSpan(),
-                    adapter->DeviceIdx(), missing, s_offset);
+  CopyDataToDMatrix(adapter, sparse_page_.data.DeviceSpan(), missing);
 
   info_.num_col_ = adapter->NumColumns();
   info_.num_row_ = adapter->NumRows();

@@ -156,5 +156,13 @@ TEST(SegmentedUnique, Regression) {
   TestSegmentedUniqueRegression(values, 0);
 }
 }
+
+TEST(Allocator, OOM) {
+  auto size = dh::AvailableMemory(0) * 4;
+  ASSERT_THROW({dh::caching_device_vector<char> vec(size);}, dmlc::Error);
+  ASSERT_THROW({dh::device_vector<char> vec(size);}, dmlc::Error);
+  // Clear last error so we don't fail subsequent tests
+  cudaGetLastError();
+}
 }  // namespace common
 }  // namespace xgboost
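
One detail of the test worth spelling out: the CUDA runtime latches the most recent error per thread, and cudaGetLastError() both returns and clears it. Without the final cudaGetLastError() call, the allocation failures provoked here would trip the next test's safe_cuda check. A hedged sketch of that latch-and-clear behavior (the exact error string printed may vary by driver and device):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  void* p = nullptr;
  // Deliberately request far more memory than any device has.
  cudaError_t err = cudaMalloc(&p, ~static_cast<size_t>(0));
  std::printf("cudaMalloc:  %s\n", cudaGetErrorString(err));
  // The failure stays latched until something reads it...
  std::printf("latched:     %s\n", cudaGetErrorString(cudaGetLastError()));
  // ...after which the error state is clean again, as the test relies on.
  std::printf("after clear: %s\n", cudaGetErrorString(cudaGetLastError()));
  return 0;
}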