/*! * Copyright 2019 by Contributors * \file simple_dmatrix.cu */ #include #include #include #include #include "../common/random.h" #include "./simple_dmatrix.h" #include "device_adapter.cuh" namespace xgboost { namespace data { template void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, int device_idx, float missing) { IsValidFunctor is_valid(missing); // Count elements per row dh::LaunchN(device_idx, batch.Size(), [=] __device__(size_t idx) { auto element = batch.GetElement(idx); if (is_valid(element)) { atomicAdd(reinterpret_cast( // NOLINT &offset[element.row_idx]), static_cast(1)); // NOLINT } }); dh::XGBCachingDeviceAllocator alloc; thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()), thrust::device_pointer_cast(offset.data() + offset.size()), thrust::device_pointer_cast(offset.data())); } template struct COOToEntryOp { AdapterBatchT batch; __device__ Entry operator()(size_t idx) { const auto& e = batch.GetElement(idx); return Entry(e.column_idx, e.value); } }; // Here the data is already correctly ordered and simply needs to be compacted // to remove missing data template void CopyDataToDMatrix(AdapterT* adapter, common::Span data, float missing) { auto batch = adapter->Value(); auto counting = thrust::make_counting_iterator(0llu); dh::XGBCachingDeviceAllocator alloc; COOToEntryOp transform_op{batch}; thrust::transform_iterator transform_iter(counting, transform_op); // We loop over batches because thrust::copy_if cant deal with sizes > 2^31 // See thrust issue #1302 size_t max_copy_size = std::numeric_limits::max() / 2; auto begin_output = thrust::device_pointer_cast(data.data()); for (size_t offset = 0; offset < batch.Size(); offset += max_copy_size) { auto begin_input = transform_iter + offset; auto end_input = transform_iter + std::min(offset + max_copy_size, batch.Size()); begin_output = thrust::copy_if(thrust::cuda::par(alloc), begin_input, end_input, begin_output, IsValidFunctor(missing)); } } // Does not currently support metainfo as no on-device data source contains this // Current implementation assumes a single batch. More batches can // be supported in future. Does not currently support inferring row/column size template SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) { dh::safe_cuda(cudaSetDevice(adapter->DeviceIdx())); CHECK(adapter->NumRows() != kAdapterUnknownSize); CHECK(adapter->NumColumns() != kAdapterUnknownSize); adapter->BeforeFirst(); adapter->Next(); auto& batch = adapter->Value(); sparse_page_.offset.SetDevice(adapter->DeviceIdx()); sparse_page_.data.SetDevice(adapter->DeviceIdx()); // Enforce single batch CHECK(!adapter->Next()); sparse_page_.offset.Resize(adapter->NumRows() + 1); auto s_offset = sparse_page_.offset.DeviceSpan(); CountRowOffsets(batch, s_offset, adapter->DeviceIdx(), missing); info_.num_nonzero_ = sparse_page_.offset.HostVector().back(); sparse_page_.data.Resize(info_.num_nonzero_); CopyDataToDMatrix(adapter, sparse_page_.data.DeviceSpan(), missing); info_.num_col_ = adapter->NumColumns(); info_.num_row_ = adapter->NumRows(); // Synchronise worker columns rabit::Allreduce(&info_.num_col_, 1); } template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing, int nthread); template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing, int nthread); } // namespace data } // namespace xgboost