[EM] Support mmap backed ellpack. (#10602)
- Support resource view in ellpack. - Define the CUDA version of MMAP resource. - Define the CUDA version of malloc resource. - Refactor CUDA runtime API wrappers, and add memory-access-related wrappers. - Gather Windows macros into a single header.
This commit is contained in:
@@ -11,8 +11,9 @@
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/cuda_context.cuh"
|
||||
#include "../common/hist_util.cuh"
|
||||
#include "../common/transform_iterator.h" // MakeIndexTransformIter
|
||||
#include "device_adapter.cuh" // for NoInfInData
|
||||
#include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc
|
||||
#include "../common/transform_iterator.h" // MakeIndexTransformIter
|
||||
#include "device_adapter.cuh" // for NoInfInData
|
||||
#include "ellpack_page.cuh"
|
||||
#include "ellpack_page.h"
|
||||
#include "gradient_index.h"
|
||||
@@ -43,21 +44,19 @@ __global__ void CompressBinEllpackKernel(
|
||||
common::CompressedBufferWriter wr,
|
||||
common::CompressedByteT* __restrict__ buffer, // gidx_buffer
|
||||
const size_t* __restrict__ row_ptrs, // row offset of input data
|
||||
const Entry* __restrict__ entries, // One batch of input data
|
||||
const float* __restrict__ cuts, // HistogramCuts::cut_values_
|
||||
const uint32_t* __restrict__ cut_ptrs, // HistogramCuts::cut_ptrs_
|
||||
const Entry* __restrict__ entries, // One batch of input data
|
||||
const float* __restrict__ cuts, // HistogramCuts::cut_values_
|
||||
const uint32_t* __restrict__ cut_ptrs, // HistogramCuts::cut_ptrs_
|
||||
common::Span<FeatureType const> feature_types,
|
||||
size_t base_row, // batch_row_begin
|
||||
size_t n_rows,
|
||||
size_t row_stride,
|
||||
unsigned int null_gidx_value) {
|
||||
size_t base_row, // batch_row_begin
|
||||
size_t n_rows, size_t row_stride, std::uint32_t null_gidx_value) {
|
||||
size_t irow = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int ifeature = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
if (irow >= n_rows || ifeature >= row_stride) {
|
||||
return;
|
||||
}
|
||||
int row_length = static_cast<int>(row_ptrs[irow + 1] - row_ptrs[irow]);
|
||||
unsigned int bin = null_gidx_value;
|
||||
std::uint32_t bin = null_gidx_value;
|
||||
if (ifeature < row_length) {
|
||||
Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature];
|
||||
int feature = entry.index;
|
||||
@@ -89,25 +88,23 @@ __global__ void CompressBinEllpackKernel(
|
||||
}
|
||||
|
||||
// Construct an ELLPACK matrix with the given number of empty rows.
|
||||
EllpackPageImpl::EllpackPageImpl(DeviceOrd device,
|
||||
EllpackPageImpl::EllpackPageImpl(Context const* ctx,
|
||||
std::shared_ptr<common::HistogramCuts const> cuts, bool is_dense,
|
||||
bst_idx_t row_stride, bst_idx_t n_rows)
|
||||
: is_dense(is_dense), cuts_(std::move(cuts)), row_stride{row_stride}, n_rows{n_rows} {
|
||||
monitor_.Init("ellpack_page");
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
|
||||
|
||||
monitor_.Start("InitCompressedData");
|
||||
this->InitCompressedData(device);
|
||||
monitor_.Stop("InitCompressedData");
|
||||
this->InitCompressedData(ctx);
|
||||
}
|
||||
|
||||
EllpackPageImpl::EllpackPageImpl(DeviceOrd device,
|
||||
EllpackPageImpl::EllpackPageImpl(Context const* ctx,
|
||||
std::shared_ptr<common::HistogramCuts const> cuts,
|
||||
const SparsePage& page, bool is_dense, size_t row_stride,
|
||||
common::Span<FeatureType const> feature_types)
|
||||
: cuts_(std::move(cuts)), is_dense(is_dense), n_rows(page.Size()), row_stride(row_stride) {
|
||||
this->InitCompressedData(device);
|
||||
this->CreateHistIndices(device, page, feature_types);
|
||||
this->InitCompressedData(ctx);
|
||||
this->CreateHistIndices(ctx->Device(), page, feature_types);
|
||||
}
|
||||
|
||||
// Construct an ELLPACK matrix in memory.
|
||||
@@ -129,9 +126,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
|
||||
}
|
||||
monitor_.Stop("Quantiles");
|
||||
|
||||
monitor_.Start("InitCompressedData");
|
||||
this->InitCompressedData(ctx->Device());
|
||||
monitor_.Stop("InitCompressedData");
|
||||
this->InitCompressedData(ctx);
|
||||
|
||||
dmat->Info().feature_types.SetDevice(ctx->Device());
|
||||
auto ft = dmat->Info().feature_types.ConstDeviceSpan();
|
||||
@@ -234,7 +229,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
|
||||
|
||||
auto device_accessor = dst->GetDeviceAccessor(device);
|
||||
common::CompressedBufferWriter writer(device_accessor.NumSymbols());
|
||||
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
|
||||
auto d_compressed_buffer = dst->gidx_buffer.data();
|
||||
|
||||
// We redirect the scan output into this functor to do the actual writing
|
||||
WriteCompressedEllpackFunctor<AdapterBatchT> functor(
|
||||
@@ -275,7 +270,7 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t
|
||||
// Write the null values
|
||||
auto device_accessor = dst->GetDeviceAccessor(device);
|
||||
common::CompressedBufferWriter writer(device_accessor.NumSymbols());
|
||||
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
|
||||
auto d_compressed_buffer = dst->gidx_buffer.data();
|
||||
auto row_stride = dst->row_stride;
|
||||
dh::LaunchN(row_stride * dst->n_rows, [=] __device__(size_t idx) {
|
||||
// For some reason this variable got captured as const
|
||||
@@ -290,20 +285,20 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t
|
||||
}
|
||||
|
||||
template <typename AdapterBatch>
|
||||
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
|
||||
common::Span<size_t> row_counts_span,
|
||||
EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing,
|
||||
bool is_dense, common::Span<size_t> row_counts_span,
|
||||
common::Span<FeatureType const> feature_types, size_t row_stride,
|
||||
size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts) {
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
|
||||
|
||||
*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
|
||||
CopyDataToEllpack(batch, feature_types, this, device, missing);
|
||||
WriteNullValues(this, device, row_counts_span);
|
||||
*this = EllpackPageImpl(ctx, cuts, is_dense, row_stride, n_rows);
|
||||
CopyDataToEllpack(batch, feature_types, this, ctx->Device(), missing);
|
||||
WriteNullValues(this, ctx->Device(), row_counts_span);
|
||||
}
|
||||
|
||||
#define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \
|
||||
template EllpackPageImpl::EllpackPageImpl( \
|
||||
__BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \
|
||||
Context const* ctx, __BATCH_T batch, float missing, bool is_dense, \
|
||||
common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \
|
||||
size_t row_stride, size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);
|
||||
|
||||
@@ -365,12 +360,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
|
||||
row_stride = *std::max_element(it, it + page.Size());
|
||||
|
||||
CHECK(ctx->IsCUDA());
|
||||
monitor_.Start("InitCompressedData");
|
||||
InitCompressedData(ctx->Device());
|
||||
monitor_.Stop("InitCompressedData");
|
||||
InitCompressedData(ctx);
|
||||
|
||||
// copy gidx
|
||||
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
|
||||
common::CompressedByteT* d_compressed_buffer = gidx_buffer.data();
|
||||
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
|
||||
auto d_row_ptr = dh::ToSpan(row_ptr);
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
|
||||
@@ -389,20 +382,20 @@ struct CopyPage {
|
||||
// The number of elements to skip.
|
||||
size_t offset;
|
||||
|
||||
CopyPage(EllpackPageImpl *dst, EllpackPageImpl const *src, size_t offset)
|
||||
: cbw{dst->NumSymbols()}, dst_data_d{dst->gidx_buffer.DevicePointer()},
|
||||
src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
|
||||
CopyPage(EllpackPageImpl* dst, EllpackPageImpl const* src, size_t offset)
|
||||
: cbw{dst->NumSymbols()},
|
||||
dst_data_d{dst->gidx_buffer.data()},
|
||||
src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()},
|
||||
offset(offset) {}
|
||||
|
||||
__device__ void operator()(size_t element_id) {
|
||||
cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id],
|
||||
element_id + offset);
|
||||
cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset);
|
||||
}
|
||||
};
|
||||
|
||||
// Copy the data from the given EllpackPage to the current page.
|
||||
size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
|
||||
monitor_.Start("Copy");
|
||||
size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) {
|
||||
monitor_.Start(__func__);
|
||||
bst_idx_t num_elements = page->n_rows * page->row_stride;
|
||||
CHECK_EQ(row_stride, page->row_stride);
|
||||
CHECK_EQ(NumSymbols(), page->NumSymbols());
|
||||
@@ -411,10 +404,8 @@ size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size
|
||||
LOG(FATAL) << "Concatenating the same Ellpack.";
|
||||
return this->n_rows * this->row_stride;
|
||||
}
|
||||
gidx_buffer.SetDevice(device);
|
||||
page->gidx_buffer.SetDevice(device);
|
||||
dh::LaunchN(num_elements, CopyPage(this, page, offset));
|
||||
monitor_.Stop("Copy");
|
||||
dh::LaunchN(num_elements, CopyPage{this, page, offset});
|
||||
monitor_.Stop(__func__);
|
||||
return num_elements;
|
||||
}
|
||||
|
||||
@@ -423,8 +414,8 @@ struct CompactPage {
|
||||
common::CompressedBufferWriter cbw;
|
||||
common::CompressedByteT* dst_data_d;
|
||||
common::CompressedIterator<uint32_t> src_iterator_d;
|
||||
/*! \brief An array that maps the rows from the full DMatrix to the compacted
|
||||
* page.
|
||||
/**
|
||||
* @brief An array that maps the rows from the full DMatrix to the compacted page.
|
||||
*
|
||||
* The total size is the number of rows in the original, uncompacted DMatrix.
|
||||
* Elements are the row ids in the compacted page. Rows not needed are set to
|
||||
@@ -438,24 +429,24 @@ struct CompactPage {
|
||||
size_t base_rowid;
|
||||
size_t row_stride;
|
||||
|
||||
CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src,
|
||||
common::Span<size_t> row_indexes)
|
||||
CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src, common::Span<size_t> row_indexes)
|
||||
: cbw{dst->NumSymbols()},
|
||||
dst_data_d{dst->gidx_buffer.DevicePointer()},
|
||||
src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
|
||||
dst_data_d{dst->gidx_buffer.data()},
|
||||
src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()},
|
||||
row_indexes(row_indexes),
|
||||
base_rowid{src->base_rowid},
|
||||
row_stride{src->row_stride} {}
|
||||
|
||||
__device__ void operator()(size_t row_id) {
|
||||
__device__ void operator()(bst_idx_t row_id) {
|
||||
size_t src_row = base_rowid + row_id;
|
||||
size_t dst_row = row_indexes[src_row];
|
||||
if (dst_row == SIZE_MAX) return;
|
||||
if (dst_row == SIZE_MAX) {
|
||||
return;
|
||||
}
|
||||
size_t dst_offset = dst_row * row_stride;
|
||||
size_t src_offset = row_id * row_stride;
|
||||
for (size_t j = 0; j < row_stride; j++) {
|
||||
cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j],
|
||||
dst_offset + j);
|
||||
cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j);
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -467,28 +458,22 @@ void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page,
|
||||
CHECK_EQ(row_stride, page->row_stride);
|
||||
CHECK_EQ(NumSymbols(), page->NumSymbols());
|
||||
CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
|
||||
gidx_buffer.SetDevice(ctx->Device());
|
||||
page->gidx_buffer.SetDevice(ctx->Device());
|
||||
auto cuctx = ctx->CUDACtx();
|
||||
dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage(this, page, row_indexes));
|
||||
dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage{this, page, row_indexes});
|
||||
monitor_.Stop(__func__);
|
||||
}
|
||||
|
||||
// Initialize the buffer to stored compressed features.
|
||||
void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
|
||||
size_t num_symbols = NumSymbols();
|
||||
void EllpackPageImpl::InitCompressedData(Context const* ctx) {
|
||||
monitor_.Start(__func__);
|
||||
auto num_symbols = NumSymbols();
|
||||
|
||||
// Required buffer size for storing data matrix in ELLPack format.
|
||||
size_t compressed_size_bytes =
|
||||
std::size_t compressed_size_bytes =
|
||||
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
|
||||
gidx_buffer.SetDevice(device);
|
||||
// Don't call fill unnecessarily
|
||||
if (gidx_buffer.Size() == 0) {
|
||||
gidx_buffer.Resize(compressed_size_bytes, 0);
|
||||
} else {
|
||||
gidx_buffer.Resize(compressed_size_bytes, 0);
|
||||
thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
|
||||
}
|
||||
auto init = static_cast<common::CompressedByteT>(0);
|
||||
gidx_buffer = common::MakeFixedVecWithCudaMalloc(ctx, compressed_size_bytes, init);
|
||||
monitor_.Stop(__func__);
|
||||
}
|
||||
|
||||
// Compress a CSR page into ELLPACK.
|
||||
@@ -496,7 +481,7 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
|
||||
const SparsePage& row_batch,
|
||||
common::Span<FeatureType const> feature_types) {
|
||||
if (row_batch.Size() == 0) return;
|
||||
unsigned int null_gidx_value = NumSymbols() - 1;
|
||||
std::uint32_t null_gidx_value = NumSymbols() - 1;
|
||||
|
||||
const auto& offset_vec = row_batch.offset.ConstHostVector();
|
||||
|
||||
@@ -541,13 +526,11 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
|
||||
const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
|
||||
common::DivRoundUp(row_stride, block3.y), 1);
|
||||
auto device_accessor = GetDeviceAccessor(device);
|
||||
dh::LaunchKernel {grid3, block3}(
|
||||
CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()),
|
||||
gidx_buffer.DevicePointer(), row_ptrs.data().get(),
|
||||
entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
|
||||
device_accessor.feature_segments.data(), feature_types,
|
||||
batch_row_begin, batch_nrows, row_stride,
|
||||
null_gidx_value);
|
||||
dh::LaunchKernel{grid3, block3}( // NOLINT
|
||||
CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()), gidx_buffer.data(),
|
||||
row_ptrs.data().get(), entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
|
||||
device_accessor.feature_segments.data(), feature_types, batch_row_begin, batch_nrows,
|
||||
row_stride, null_gidx_value);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -566,26 +549,31 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
|
||||
|
||||
EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
|
||||
DeviceOrd device, common::Span<FeatureType const> feature_types) const {
|
||||
gidx_buffer.SetDevice(device);
|
||||
return {device,
|
||||
cuts_,
|
||||
is_dense,
|
||||
row_stride,
|
||||
base_rowid,
|
||||
n_rows,
|
||||
common::CompressedIterator<uint32_t>(gidx_buffer.ConstDevicePointer(),
|
||||
NumSymbols()),
|
||||
common::CompressedIterator<uint32_t>(gidx_buffer.data(), NumSymbols()),
|
||||
feature_types};
|
||||
}
|
||||
|
||||
EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
|
||||
Context const* ctx, std::vector<common::CompressedByteT>* h_gidx_buffer,
|
||||
common::Span<FeatureType const> feature_types) const {
|
||||
h_gidx_buffer->resize(gidx_buffer.size());
|
||||
CHECK_EQ(h_gidx_buffer->size(), gidx_buffer.size());
|
||||
CHECK_NE(gidx_buffer.size(), 0);
|
||||
dh::safe_cuda(cudaMemcpyAsync(h_gidx_buffer->data(), gidx_buffer.data(), gidx_buffer.size_bytes(),
|
||||
cudaMemcpyDefault, dh::DefaultStream()));
|
||||
return {DeviceOrd::CPU(),
|
||||
cuts_,
|
||||
is_dense,
|
||||
row_stride,
|
||||
base_rowid,
|
||||
n_rows,
|
||||
common::CompressedIterator<uint32_t>(gidx_buffer.ConstHostPointer(), NumSymbols()),
|
||||
common::CompressedIterator<uint32_t>(h_gidx_buffer->data(), NumSymbols()),
|
||||
feature_types};
|
||||
}
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,23 +1,25 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost Contributors
|
||||
* Copyright 2019-2024, XGBoost Contributors
|
||||
*/
|
||||
|
||||
#ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_
|
||||
#define XGBOOST_DATA_ELLPACK_PAGE_CUH_
|
||||
|
||||
#include <thrust/binary_search.h>
|
||||
#include <xgboost/data.h>
|
||||
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/compressed_iterator.h"
|
||||
#include "../common/device_helpers.cuh"
|
||||
#include "../common/hist_util.h"
|
||||
#include "../common/ref_resource_view.h" // for RefResourceView
|
||||
#include "ellpack_page.h"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
namespace xgboost {
|
||||
/** \brief Struct for accessing and manipulating an ELLPACK matrix on the
|
||||
* device. Does not own underlying memory and may be trivially copied into
|
||||
* kernels.*/
|
||||
/**
|
||||
* @brief Struct for accessing and manipulating an ELLPACK matrix on the device.
|
||||
*
|
||||
* Does not own underlying memory and may be trivially copied into kernels.
|
||||
*/
|
||||
struct EllpackDeviceAccessor {
|
||||
/*! \brief Whether or not if the matrix is dense. */
|
||||
bool is_dense;
|
||||
@@ -128,31 +130,31 @@ class GHistIndexMatrix;
|
||||
|
||||
class EllpackPageImpl {
|
||||
public:
|
||||
/*!
|
||||
* \brief Default constructor.
|
||||
/**
|
||||
* @brief Default constructor.
|
||||
*
|
||||
* This is used in the external memory case. An empty ELLPACK page is constructed with its content
|
||||
* set later by the reader.
|
||||
*/
|
||||
EllpackPageImpl() = default;
|
||||
|
||||
/*!
|
||||
* \brief Constructor from an existing EllpackInfo.
|
||||
/**
|
||||
* @brief Constructor from an existing EllpackInfo.
|
||||
*
|
||||
* This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
|
||||
* and the given number of rows.
|
||||
* This is used in the sampling case. The ELLPACK page is constructed from an existing
|
||||
* Ellpack page and the given number of rows.
|
||||
*/
|
||||
EllpackPageImpl(DeviceOrd device, std::shared_ptr<common::HistogramCuts const> cuts,
|
||||
EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
|
||||
bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows);
|
||||
/*!
|
||||
* \brief Constructor used for external memory.
|
||||
/**
|
||||
* @brief Constructor used for external memory.
|
||||
*/
|
||||
EllpackPageImpl(DeviceOrd device, std::shared_ptr<common::HistogramCuts const> cuts,
|
||||
EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
|
||||
const SparsePage& page, bool is_dense, size_t row_stride,
|
||||
common::Span<FeatureType const> feature_types);
|
||||
|
||||
/*!
|
||||
* \brief Constructor from an existing DMatrix.
|
||||
/**
|
||||
* @brief Constructor from an existing DMatrix.
|
||||
*
|
||||
* This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
|
||||
* in CSR format.
|
||||
@@ -160,37 +162,39 @@ class EllpackPageImpl {
|
||||
explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);
|
||||
|
||||
template <typename AdapterBatch>
|
||||
explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
|
||||
explicit EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense,
|
||||
common::Span<size_t> row_counts_span,
|
||||
common::Span<FeatureType const> feature_types, size_t row_stride,
|
||||
size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);
|
||||
/**
|
||||
* \brief Constructor from an existing CPU gradient index.
|
||||
* @brief Constructor from an existing CPU gradient index.
|
||||
*/
|
||||
explicit EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& page,
|
||||
common::Span<FeatureType const> ft);
|
||||
|
||||
/*! \brief Copy the elements of the given ELLPACK page into this page.
|
||||
/**
|
||||
* @brief Copy the elements of the given ELLPACK page into this page.
|
||||
*
|
||||
* @param device The GPU device to use.
|
||||
* @param ctx The GPU context.
|
||||
* @param page The ELLPACK page to copy from.
|
||||
* @param offset The number of elements to skip before copying.
|
||||
* @returns The number of elements copied.
|
||||
*/
|
||||
size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);
|
||||
bst_idx_t Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset);
|
||||
|
||||
/*! \brief Compact the given ELLPACK page into the current page.
|
||||
/**
|
||||
* @brief Compact the given ELLPACK page into the current page.
|
||||
*
|
||||
* @param context The GPU context.
|
||||
* @param ctx The GPU context.
|
||||
* @param page The ELLPACK page to compact from.
|
||||
* @param row_indexes Row indexes for the compacted page.
|
||||
*/
|
||||
void Compact(Context const* ctx, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
|
||||
|
||||
/*! \return Number of instances in the page. */
|
||||
/** @return Number of instances in the page. */
|
||||
[[nodiscard]] bst_idx_t Size() const;
|
||||
|
||||
/*! \brief Set the base row id for this page. */
|
||||
/** @brief Set the base row id for this page. */
|
||||
void SetBaseRowId(std::size_t row_id) {
|
||||
base_rowid = row_id;
|
||||
}
|
||||
@@ -199,43 +203,54 @@ class EllpackPageImpl {
|
||||
[[nodiscard]] std::shared_ptr<common::HistogramCuts const> CutsShared() const { return cuts_; }
|
||||
void SetCuts(std::shared_ptr<common::HistogramCuts const> cuts) { cuts_ = cuts; }
|
||||
|
||||
/*! \return Estimation of memory cost of this page. */
|
||||
/** @return Estimation of memory cost of this page. */
|
||||
static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;
|
||||
|
||||
|
||||
/*! \brief Return the total number of symbols (total number of bins plus 1 for
|
||||
* not found). */
|
||||
/**
|
||||
* @brief Return the total number of symbols (total number of bins plus 1 for not
|
||||
* found).
|
||||
*/
|
||||
[[nodiscard]] std::size_t NumSymbols() const { return cuts_->TotalBins() + 1; }
|
||||
|
||||
/**
|
||||
* @brief Get an accessor that can be passed into CUDA kernels.
|
||||
*/
|
||||
[[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
|
||||
DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
|
||||
/**
|
||||
* @brief Get an accessor for host code.
|
||||
*/
|
||||
[[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
|
||||
Context const* ctx, std::vector<common::CompressedByteT>* h_gidx_buffer,
|
||||
common::Span<FeatureType const> feature_types = {}) const;
|
||||
|
||||
private:
|
||||
/*!
|
||||
* \brief Compress a single page of CSR data into ELLPACK.
|
||||
/**
|
||||
* @brief Compress a single page of CSR data into ELLPACK.
|
||||
*
|
||||
* @param device The GPU device to use.
|
||||
* @param row_batch The CSR page.
|
||||
*/
|
||||
void CreateHistIndices(DeviceOrd device,
|
||||
const SparsePage& row_batch,
|
||||
void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch,
|
||||
common::Span<FeatureType const> feature_types);
|
||||
/*!
|
||||
* \brief Initialize the buffer to store compressed features.
|
||||
/**
|
||||
* @brief Initialize the buffer to store compressed features.
|
||||
*/
|
||||
void InitCompressedData(DeviceOrd device);
|
||||
void InitCompressedData(Context const* ctx);
|
||||
|
||||
public:
|
||||
/*! \brief Whether or not if the matrix is dense. */
|
||||
/** @brief Whether or not if the matrix is dense. */
|
||||
bool is_dense;
|
||||
/*! \brief Row length for ELLPACK. */
|
||||
/** @brief Row length for ELLPACK. */
|
||||
bst_idx_t row_stride;
|
||||
bst_idx_t base_rowid{0};
|
||||
bst_idx_t n_rows{};
|
||||
/*! \brief global index of histogram, which is stored in ELLPACK format. */
|
||||
HostDeviceVector<common::CompressedByteT> gidx_buffer;
|
||||
bst_idx_t n_rows{0};
|
||||
/**
|
||||
* @brief Index of the gradient histogram, which is stored in ELLPACK format.
|
||||
*
|
||||
* This can be backed by various storage types.
|
||||
*/
|
||||
common::RefResourceView<common::CompressedByteT> gidx_buffer;
|
||||
|
||||
private:
|
||||
std::shared_ptr<common::HistogramCuts const> cuts_;
|
||||
|
||||
@@ -4,11 +4,12 @@
|
||||
#include <dmlc/registry.h>
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for uint64_t
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
|
||||
#include "ellpack_page.cuh" // for EllpackPage
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc
|
||||
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
|
||||
#include "ellpack_page.cuh" // for EllpackPage
|
||||
#include "ellpack_page_raw_format.h"
|
||||
#include "ellpack_page_source.h"
|
||||
|
||||
@@ -16,8 +17,10 @@ namespace xgboost::data {
|
||||
DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);
|
||||
|
||||
namespace {
|
||||
// Function to support system without HMM or ATS
|
||||
template <typename T>
|
||||
[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi, HostDeviceVector<T>* vec) {
|
||||
[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi,
|
||||
common::RefResourceView<T>* vec) {
|
||||
std::uint64_t n{0};
|
||||
if (!fi->Read(&n)) {
|
||||
return false;
|
||||
@@ -33,34 +36,34 @@ template <typename T>
|
||||
return false;
|
||||
}
|
||||
|
||||
vec->Resize(n);
|
||||
auto d_vec = vec->DeviceSpan();
|
||||
dh::safe_cuda(
|
||||
cudaMemcpyAsync(d_vec.data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
|
||||
auto ctx = Context{}.MakeCUDA(common::CurrentDevice());
|
||||
*vec = common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast<T>(0));
|
||||
dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
|
||||
return true;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
#define RET_IF_NOT(expr) \
|
||||
if (!(expr)) { \
|
||||
return false; \
|
||||
}
|
||||
|
||||
[[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page,
|
||||
common::AlignedResourceReadStream* fi) {
|
||||
auto* impl = page->Impl();
|
||||
|
||||
impl->SetCuts(this->cuts_);
|
||||
if (!fi->Read(&impl->n_rows)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&impl->is_dense)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&impl->row_stride)) {
|
||||
return false;
|
||||
}
|
||||
impl->gidx_buffer.SetDevice(device_);
|
||||
if (!ReadDeviceVec(fi, &impl->gidx_buffer)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&impl->base_rowid)) {
|
||||
return false;
|
||||
RET_IF_NOT(fi->Read(&impl->n_rows));
|
||||
RET_IF_NOT(fi->Read(&impl->is_dense));
|
||||
RET_IF_NOT(fi->Read(&impl->row_stride));
|
||||
|
||||
if (has_hmm_ats_) {
|
||||
RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
|
||||
} else {
|
||||
RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer));
|
||||
}
|
||||
RET_IF_NOT(fi->Read(&impl->base_rowid));
|
||||
dh::DefaultStream().Sync();
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -71,8 +74,10 @@ template <typename T>
|
||||
bytes += fo->Write(impl->n_rows);
|
||||
bytes += fo->Write(impl->is_dense);
|
||||
bytes += fo->Write(impl->row_stride);
|
||||
CHECK(!impl->gidx_buffer.ConstHostVector().empty());
|
||||
bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector());
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer;
|
||||
Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
|
||||
[[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer);
|
||||
bytes += common::WriteVec(fo, h_gidx_buffer);
|
||||
bytes += fo->Write(impl->base_rowid);
|
||||
dh::DefaultStream().Sync();
|
||||
return bytes;
|
||||
@@ -82,33 +87,20 @@ template <typename T>
|
||||
auto* impl = page->Impl();
|
||||
CHECK(this->cuts_->cut_values_.DeviceCanRead());
|
||||
impl->SetCuts(this->cuts_);
|
||||
if (!fi->Read(&impl->n_rows)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&impl->is_dense)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&impl->row_stride)) {
|
||||
return false;
|
||||
}
|
||||
RET_IF_NOT(fi->Read(&impl->n_rows));
|
||||
RET_IF_NOT(fi->Read(&impl->is_dense));
|
||||
RET_IF_NOT(fi->Read(&impl->row_stride));
|
||||
|
||||
// Read vec
|
||||
Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
|
||||
bst_idx_t n{0};
|
||||
if (!fi->Read(&n)) {
|
||||
return false;
|
||||
}
|
||||
RET_IF_NOT(fi->Read(&n));
|
||||
if (n != 0) {
|
||||
impl->gidx_buffer.SetDevice(device_);
|
||||
impl->gidx_buffer.Resize(n);
|
||||
auto span = impl->gidx_buffer.DeviceSpan();
|
||||
if (!fi->Read(span.data(), span.size_bytes())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!fi->Read(&impl->base_rowid)) {
|
||||
return false;
|
||||
impl->gidx_buffer =
|
||||
common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast<common::CompressedByteT>(0));
|
||||
RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()));
|
||||
}
|
||||
RET_IF_NOT(fi->Read(&impl->base_rowid));
|
||||
|
||||
dh::DefaultStream().Sync();
|
||||
return true;
|
||||
@@ -123,16 +115,17 @@ template <typename T>
|
||||
bytes += fo->Write(impl->row_stride);
|
||||
|
||||
// Write vector
|
||||
bst_idx_t n = impl->gidx_buffer.Size();
|
||||
bst_idx_t n = impl->gidx_buffer.size();
|
||||
bytes += fo->Write(n);
|
||||
|
||||
if (!impl->gidx_buffer.Empty()) {
|
||||
auto span = impl->gidx_buffer.ConstDeviceSpan();
|
||||
bytes += fo->Write(span.data(), span.size_bytes());
|
||||
if (!impl->gidx_buffer.empty()) {
|
||||
bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
|
||||
}
|
||||
bytes += fo->Write(impl->base_rowid);
|
||||
|
||||
dh::DefaultStream().Sync();
|
||||
return bytes;
|
||||
}
|
||||
|
||||
#undef RET_IF_NOT
|
||||
} // namespace xgboost::data
|
||||
|
||||
@@ -26,10 +26,13 @@ class EllpackHostCacheStream;
|
||||
class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
|
||||
std::shared_ptr<common::HistogramCuts const> cuts_;
|
||||
DeviceOrd device_;
|
||||
// Supports CUDA HMM or ATS
|
||||
bool has_hmm_ats_{false};
|
||||
|
||||
public:
|
||||
explicit EllpackPageRawFormat(std::shared_ptr<common::HistogramCuts const> cuts, DeviceOrd device)
|
||||
: cuts_{std::move(cuts)}, device_{device} {}
|
||||
explicit EllpackPageRawFormat(std::shared_ptr<common::HistogramCuts const> cuts, DeviceOrd device,
|
||||
bool has_hmm_ats)
|
||||
: cuts_{std::move(cuts)}, device_{device}, has_hmm_ats_{has_hmm_ats} {}
|
||||
[[nodiscard]] bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override;
|
||||
[[nodiscard]] std::size_t Write(const EllpackPage& page,
|
||||
common::AlignedFileWriteStream* fo) override;
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "../common/common.h" // for safe_cuda
|
||||
#include "../common/cuda_pinned_allocator.h" // for pinned_allocator
|
||||
#include "../common/device_helpers.cuh" // for CUDAStreamView, DefaultStream
|
||||
#include "../common/resource.cuh" // for PrivateCudaMmapConstStream
|
||||
#include "ellpack_page.cuh" // for EllpackPageImpl
|
||||
#include "ellpack_page.h" // for EllpackPage
|
||||
#include "ellpack_page_source.h"
|
||||
@@ -86,16 +87,16 @@ void EllpackHostCacheStream::Seek(bst_idx_t offset_bytes) { this->p_impl_->Seek(
|
||||
void EllpackHostCacheStream::Bound(bst_idx_t offset_bytes) { this->p_impl_->Bound(offset_bytes); }
|
||||
|
||||
/**
|
||||
* EllpackFormatType
|
||||
* EllpackCacheStreamPolicy
|
||||
*/
|
||||
|
||||
template <typename S, template <typename> typename F>
|
||||
EllpackFormatStreamPolicy<S, F>::EllpackFormatStreamPolicy()
|
||||
EllpackCacheStreamPolicy<S, F>::EllpackCacheStreamPolicy()
|
||||
: p_cache_{std::make_shared<EllpackHostCache>()} {}
|
||||
|
||||
template <typename S, template <typename> typename F>
|
||||
[[nodiscard]] std::unique_ptr<typename EllpackFormatStreamPolicy<S, F>::WriterT>
|
||||
EllpackFormatStreamPolicy<S, F>::CreateWriter(StringView, std::uint32_t iter) {
|
||||
[[nodiscard]] std::unique_ptr<typename EllpackCacheStreamPolicy<S, F>::WriterT>
|
||||
EllpackCacheStreamPolicy<S, F>::CreateWriter(StringView, std::uint32_t iter) {
|
||||
auto fo = std::make_unique<EllpackHostCacheStream>(this->p_cache_);
|
||||
if (iter == 0) {
|
||||
CHECK(this->p_cache_->cache.empty());
|
||||
@@ -106,9 +107,8 @@ EllpackFormatStreamPolicy<S, F>::CreateWriter(StringView, std::uint32_t iter) {
|
||||
}
|
||||
|
||||
template <typename S, template <typename> typename F>
|
||||
[[nodiscard]] std::unique_ptr<typename EllpackFormatStreamPolicy<S, F>::ReaderT>
|
||||
EllpackFormatStreamPolicy<S, F>::CreateReader(StringView, bst_idx_t offset,
|
||||
bst_idx_t length) const {
|
||||
[[nodiscard]] std::unique_ptr<typename EllpackCacheStreamPolicy<S, F>::ReaderT>
|
||||
EllpackCacheStreamPolicy<S, F>::CreateReader(StringView, bst_idx_t offset, bst_idx_t length) const {
|
||||
auto fi = std::make_unique<ReaderT>(this->p_cache_);
|
||||
fi->Seek(offset);
|
||||
fi->Bound(offset + length);
|
||||
@@ -117,18 +117,40 @@ EllpackFormatStreamPolicy<S, F>::CreateReader(StringView, bst_idx_t offset,
|
||||
}
|
||||
|
||||
// Instantiation
|
||||
template EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::EllpackFormatStreamPolicy();
|
||||
template EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::EllpackCacheStreamPolicy();
|
||||
|
||||
template std::unique_ptr<
|
||||
typename EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::WriterT>
|
||||
EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateWriter(StringView name,
|
||||
std::uint32_t iter);
|
||||
typename EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::WriterT>
|
||||
EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateWriter(StringView name,
|
||||
std::uint32_t iter);
|
||||
|
||||
template std::unique_ptr<
|
||||
typename EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::ReaderT>
|
||||
EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(
|
||||
typename EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::ReaderT>
|
||||
EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(
|
||||
StringView name, std::uint64_t offset, std::uint64_t length) const;
|
||||
|
||||
/**
|
||||
* EllpackMmapStreamPolicy
|
||||
*/
|
||||
|
||||
template <typename S, template <typename> typename F>
|
||||
[[nodiscard]] std::unique_ptr<typename EllpackMmapStreamPolicy<S, F>::ReaderT>
|
||||
EllpackMmapStreamPolicy<S, F>::CreateReader(StringView name, bst_idx_t offset,
|
||||
bst_idx_t length) const {
|
||||
if (has_hmm_) {
|
||||
return std::make_unique<common::PrivateCudaMmapConstStream>(name, offset, length);
|
||||
} else {
|
||||
return std::make_unique<common::PrivateMmapConstStream>(name, offset, length);
|
||||
}
|
||||
}
|
||||
|
||||
// Instantiation
|
||||
template std::unique_ptr<
|
||||
typename EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>::ReaderT>
|
||||
EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(StringView name,
|
||||
bst_idx_t offset,
|
||||
bst_idx_t length) const;
|
||||
|
||||
/**
|
||||
* EllpackPageSourceImpl
|
||||
*/
|
||||
@@ -146,8 +168,8 @@ void EllpackPageSourceImpl<F>::Fetch() {
|
||||
auto const& csr = this->source_->Page();
|
||||
this->page_.reset(new EllpackPage{});
|
||||
auto* impl = this->page_->Impl();
|
||||
*impl = EllpackPageImpl{this->Device(), this->GetCuts(), *csr,
|
||||
is_dense_, row_stride_, feature_types_};
|
||||
Context ctx = Context{}.MakeCUDA(this->Device().ordinal);
|
||||
*impl = EllpackPageImpl{&ctx, this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_};
|
||||
this->page_->SetBaseRowId(csr->base_rowid);
|
||||
this->WriteCache();
|
||||
}
|
||||
@@ -157,5 +179,7 @@ void EllpackPageSourceImpl<F>::Fetch() {
|
||||
template void
|
||||
EllpackPageSourceImpl<DefaultFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
|
||||
template void
|
||||
EllpackPageSourceImpl<EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
|
||||
EllpackPageSourceImpl<EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
|
||||
template void
|
||||
EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
|
||||
} // namespace xgboost::data
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <memory> // for shared_ptr
|
||||
#include <utility> // for move
|
||||
|
||||
#include "../common/cuda_rt_utils.h" // for SupportsPageableMem
|
||||
#include "../common/hist_util.h" // for HistogramCuts
|
||||
#include "ellpack_page.h" // for EllpackPage
|
||||
#include "ellpack_page_raw_format.h" // for EllpackPageRawFormat
|
||||
@@ -59,14 +60,19 @@ template <typename S>
|
||||
class EllpackFormatPolicy {
|
||||
std::shared_ptr<common::HistogramCuts const> cuts_{nullptr};
|
||||
DeviceOrd device_;
|
||||
bool has_hmm_{common::SupportsPageableMem()};
|
||||
|
||||
public:
|
||||
using FormatT = EllpackPageRawFormat;
|
||||
|
||||
public:
|
||||
EllpackFormatPolicy() = default;
|
||||
// For testing with the HMM flag.
|
||||
explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {}
|
||||
|
||||
[[nodiscard]] auto CreatePageFormat() const {
|
||||
CHECK_EQ(cuts_->cut_values_.Device(), device_);
|
||||
std::unique_ptr<FormatT> fmt{new EllpackPageRawFormat{cuts_, device_}};
|
||||
std::unique_ptr<FormatT> fmt{new EllpackPageRawFormat{cuts_, device_, has_hmm_}};
|
||||
return fmt;
|
||||
}
|
||||
|
||||
@@ -83,7 +89,7 @@ class EllpackFormatPolicy {
|
||||
};
|
||||
|
||||
template <typename S, template <typename> typename F>
|
||||
class EllpackFormatStreamPolicy : public F<S> {
|
||||
class EllpackCacheStreamPolicy : public F<S> {
|
||||
std::shared_ptr<EllpackHostCache> p_cache_;
|
||||
|
||||
public:
|
||||
@@ -91,13 +97,42 @@ class EllpackFormatStreamPolicy : public F<S> {
|
||||
using ReaderT = EllpackHostCacheStream;
|
||||
|
||||
public:
|
||||
EllpackFormatStreamPolicy();
|
||||
EllpackCacheStreamPolicy();
|
||||
[[nodiscard]] std::unique_ptr<WriterT> CreateWriter(StringView name, std::uint32_t iter);
|
||||
|
||||
[[nodiscard]] std::unique_ptr<ReaderT> CreateReader(StringView name, bst_idx_t offset,
|
||||
bst_idx_t length) const;
|
||||
};
|
||||
|
||||
template <typename S, template <typename> typename F>
|
||||
class EllpackMmapStreamPolicy : public F<S> {
|
||||
bool has_hmm_{common::SupportsPageableMem()};
|
||||
|
||||
public:
|
||||
using WriterT = common::AlignedFileWriteStream;
|
||||
using ReaderT = common::AlignedResourceReadStream;
|
||||
|
||||
public:
|
||||
EllpackMmapStreamPolicy() = default;
|
||||
// For testing with the HMM flag.
|
||||
template <
|
||||
typename std::enable_if_t<std::is_same_v<F<S>, EllpackFormatPolicy<EllpackPage>>>* = nullptr>
|
||||
explicit EllpackMmapStreamPolicy(bool has_hmm) : F<S>{has_hmm}, has_hmm_{has_hmm} {}
|
||||
|
||||
[[nodiscard]] std::unique_ptr<WriterT> CreateWriter(StringView name, std::uint32_t iter) {
|
||||
std::unique_ptr<common::AlignedFileWriteStream> fo;
|
||||
if (iter == 0) {
|
||||
fo = std::make_unique<common::AlignedFileWriteStream>(name, "wb");
|
||||
} else {
|
||||
fo = std::make_unique<common::AlignedFileWriteStream>(name, "ab");
|
||||
}
|
||||
return fo;
|
||||
}
|
||||
|
||||
[[nodiscard]] std::unique_ptr<ReaderT> CreateReader(StringView name, bst_idx_t offset,
|
||||
bst_idx_t length) const;
|
||||
};
|
||||
|
||||
template <typename F>
|
||||
class EllpackPageSourceImpl : public PageSourceIncMixIn<EllpackPage, F> {
|
||||
using Super = PageSourceIncMixIn<EllpackPage, F>;
|
||||
@@ -128,11 +163,11 @@ class EllpackPageSourceImpl : public PageSourceIncMixIn<EllpackPage, F> {
|
||||
|
||||
// Cache to host
|
||||
using EllpackPageHostSource =
|
||||
EllpackPageSourceImpl<EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
|
||||
EllpackPageSourceImpl<EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
|
||||
|
||||
// Cache to disk
|
||||
using EllpackPageSource =
|
||||
EllpackPageSourceImpl<DefaultFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
|
||||
EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
template <typename F>
|
||||
|
||||
@@ -16,7 +16,8 @@ template <typename BinT, typename CompressOffset>
|
||||
void SetIndexData(Context const* ctx, EllpackPageImpl const* page,
|
||||
std::vector<size_t>* p_hit_count_tloc, CompressOffset&& get_offset,
|
||||
GHistIndexMatrix* out) {
|
||||
auto accessor = page->GetHostAccessor();
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer;
|
||||
auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer);
|
||||
auto const kNull = static_cast<bst_bin_t>(accessor.NullValue());
|
||||
|
||||
common::Span<BinT> index_data_span = {out->index.data<BinT>(), out->index.Size()};
|
||||
@@ -47,7 +48,8 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
|
||||
if (page->is_dense) {
|
||||
std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride);
|
||||
} else {
|
||||
auto accessor = page->GetHostAccessor();
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer;
|
||||
auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer);
|
||||
auto const kNull = static_cast<bst_bin_t>(accessor.NullValue());
|
||||
|
||||
common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) {
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
/**
|
||||
* Copyright 2021-2024, XGBoost contributors
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
|
||||
#define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
|
||||
|
||||
#include <dmlc/io.h> // for Stream
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
|
||||
#include "../common/hist_util.h" // for HistogramCuts
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "../common/ref_resource_view.h" // for WriteVec, ReadVec
|
||||
|
||||
namespace xgboost::data {
|
||||
inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) {
|
||||
if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
bool has_cat{false};
|
||||
if (!fi->Read(&has_cat)) {
|
||||
return false;
|
||||
}
|
||||
decltype(cuts->MaxCategory()) max_cat{0};
|
||||
if (!fi->Read(&max_cat)) {
|
||||
return false;
|
||||
}
|
||||
cuts->SetCategorical(has_cat, max_cat);
|
||||
return true;
|
||||
}
|
||||
|
||||
inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts,
|
||||
common::AlignedFileWriteStream *fo) {
|
||||
std::size_t bytes = 0;
|
||||
bytes += common::WriteVec(fo, cuts.Values());
|
||||
bytes += common::WriteVec(fo, cuts.Ptrs());
|
||||
bytes += common::WriteVec(fo, cuts.MinValues());
|
||||
bytes += fo->Write(cuts.HasCategorical());
|
||||
bytes += fo->Write(cuts.MaxCategory());
|
||||
return bytes;
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
#endif // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
|
||||
@@ -5,6 +5,7 @@
|
||||
#include <memory>
|
||||
|
||||
#include "../collective/allreduce.h"
|
||||
#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs
|
||||
#include "../common/hist_util.cuh"
|
||||
#include "batch_utils.h" // for RegenGHist
|
||||
#include "device_adapter.cuh"
|
||||
@@ -45,11 +46,17 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
|
||||
int32_t current_device;
|
||||
dh::safe_cuda(cudaGetDevice(&current_device));
|
||||
auto get_ctx = [&]() {
|
||||
Context d_ctx = (ctx->IsCUDA()) ? *ctx : Context{}.MakeCUDA(current_device);
|
||||
CHECK(!d_ctx.IsCPU());
|
||||
return d_ctx;
|
||||
};
|
||||
auto get_device = [&]() {
|
||||
auto d = (ctx->IsCUDA()) ? ctx->Device() : DeviceOrd::CUDA(current_device);
|
||||
CHECK(!d.IsCPU());
|
||||
return d;
|
||||
};
|
||||
fmat_ctx_ = get_ctx();
|
||||
|
||||
/**
|
||||
* Generate quantiles
|
||||
@@ -118,7 +125,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
// that case device id is invalid.
|
||||
ellpack_.reset(new EllpackPage);
|
||||
*(ellpack_->Impl()) =
|
||||
EllpackPageImpl(get_device(), cuts, this->IsDense(), row_stride, accumulated_rows);
|
||||
EllpackPageImpl(&fmat_ctx_, cuts, this->IsDense(), row_stride, accumulated_rows);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -142,10 +149,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
proxy->Info().feature_types.SetDevice(get_device());
|
||||
auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
|
||||
auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
|
||||
return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span,
|
||||
d_feature_types, row_stride, rows, cuts);
|
||||
return EllpackPageImpl(&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types,
|
||||
row_stride, rows, cuts);
|
||||
});
|
||||
size_t num_elements = ellpack_->Impl()->Copy(get_device(), &new_impl, offset);
|
||||
std::size_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset);
|
||||
offset += num_elements;
|
||||
|
||||
proxy->Info().num_row_ = num_rows();
|
||||
|
||||
@@ -226,7 +226,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S>, public FormatStreamPol
|
||||
}
|
||||
// An heuristic for number of pre-fetched batches. We can make it part of BatchParam
|
||||
// to let user adjust number of pre-fetched batches when needed.
|
||||
std::int32_t kPrefetches = 3;
|
||||
std::int32_t constexpr kPrefetches = 3;
|
||||
std::int32_t n_prefetches = std::min(nthreads_, kPrefetches);
|
||||
n_prefetches = std::max(n_prefetches, 1);
|
||||
std::int32_t n_prefetch_batches = std::min(static_cast<bst_idx_t>(n_prefetches), n_batches_);
|
||||
|
||||
Reference in New Issue
Block a user