Partial rewrite EllpackPage (#5352)
This commit is contained in:
@@ -13,11 +13,24 @@ class EllpackPageImpl {};
|
||||
EllpackPage::EllpackPage() = default;
|
||||
|
||||
EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param) {
|
||||
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
|
||||
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
|
||||
"EllpackPage is required";
|
||||
}
|
||||
|
||||
EllpackPage::~EllpackPage() {
|
||||
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
|
||||
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
|
||||
"EllpackPage is required";
|
||||
}
|
||||
|
||||
void EllpackPage::SetBaseRowId(size_t row_id) {
|
||||
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
|
||||
"EllpackPage is required";
|
||||
}
|
||||
|
||||
size_t EllpackPage::Size() const {
|
||||
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
|
||||
"EllpackPage is required";
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
|
||||
#include <xgboost/data.h>
|
||||
|
||||
#include "./ellpack_page.cuh"
|
||||
#include "../common/hist_util.h"
|
||||
#include "../common/random.h"
|
||||
#include "./ellpack_page.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@@ -17,13 +17,9 @@ EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param)
|
||||
|
||||
EllpackPage::~EllpackPage() = default;
|
||||
|
||||
size_t EllpackPage::Size() const {
|
||||
return impl_->Size();
|
||||
}
|
||||
size_t EllpackPage::Size() const { return impl_->Size(); }
|
||||
|
||||
void EllpackPage::SetBaseRowId(size_t row_id) {
|
||||
impl_->SetBaseRowId(row_id);
|
||||
}
|
||||
void EllpackPage::SetBaseRowId(size_t row_id) { impl_->SetBaseRowId(row_id); }
|
||||
|
||||
// Bin each input data entry, store the bin indices in compressed form.
|
||||
__global__ void CompressBinEllpackKernel(
|
||||
@@ -65,16 +61,18 @@ __global__ void CompressBinEllpackKernel(
|
||||
}
|
||||
|
||||
// Construct an ELLPACK matrix with the given number of empty rows.
|
||||
EllpackPageImpl::EllpackPageImpl(int device, EllpackInfo info, size_t n_rows) {
|
||||
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
|
||||
bool is_dense, size_t row_stride,
|
||||
size_t n_rows)
|
||||
: is_dense(is_dense),
|
||||
cuts_(std::move(cuts)),
|
||||
row_stride(row_stride),
|
||||
n_rows(n_rows) {
|
||||
monitor_.Init("ellpack_page");
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
|
||||
matrix.info = info;
|
||||
matrix.base_rowid = 0;
|
||||
matrix.n_rows = n_rows;
|
||||
|
||||
monitor_.StartCuda("InitCompressedData");
|
||||
InitCompressedData(device, n_rows);
|
||||
InitCompressedData(device);
|
||||
monitor_.StopCuda("InitCompressedData");
|
||||
}
|
||||
|
||||
@@ -93,33 +91,27 @@ size_t GetRowStride(DMatrix* dmat) {
|
||||
}
|
||||
|
||||
// Construct an ELLPACK matrix in memory.
|
||||
EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) {
|
||||
EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
|
||||
: is_dense(dmat->IsDense()) {
|
||||
monitor_.Init("ellpack_page");
|
||||
dh::safe_cuda(cudaSetDevice(param.gpu_id));
|
||||
|
||||
matrix.n_rows = dmat->Info().num_row_;
|
||||
n_rows = dmat->Info().num_row_;
|
||||
|
||||
monitor_.StartCuda("Quantiles");
|
||||
// Create the quantile sketches for the dmatrix and initialize HistogramCuts.
|
||||
size_t row_stride = GetRowStride(dmat);
|
||||
auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
|
||||
row_stride = GetRowStride(dmat);
|
||||
cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
|
||||
param.gpu_batch_nrows);
|
||||
monitor_.StopCuda("Quantiles");
|
||||
|
||||
monitor_.StartCuda("InitEllpackInfo");
|
||||
InitInfo(param.gpu_id, dmat->IsDense(), row_stride, cuts);
|
||||
monitor_.StopCuda("InitEllpackInfo");
|
||||
|
||||
monitor_.StartCuda("InitCompressedData");
|
||||
InitCompressedData(param.gpu_id, dmat->Info().num_row_);
|
||||
InitCompressedData(param.gpu_id);
|
||||
monitor_.StopCuda("InitCompressedData");
|
||||
|
||||
monitor_.StartCuda("BinningCompression");
|
||||
DeviceHistogramBuilderState hist_builder_row_state(dmat->Info().num_row_);
|
||||
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
|
||||
hist_builder_row_state.BeginBatch(batch);
|
||||
CreateHistIndices(param.gpu_id, batch, hist_builder_row_state.GetRowStateOnDevice());
|
||||
hist_builder_row_state.EndBatch();
|
||||
CreateHistIndices(param.gpu_id, batch);
|
||||
}
|
||||
monitor_.StopCuda("BinningCompression");
|
||||
}
|
||||
@@ -133,23 +125,26 @@ struct CopyPage {
|
||||
size_t offset;
|
||||
|
||||
CopyPage(EllpackPageImpl* dst, EllpackPageImpl* src, size_t offset)
|
||||
: cbw{dst->matrix.info.NumSymbols()},
|
||||
dst_data_d{dst->gidx_buffer.data()},
|
||||
src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
|
||||
: cbw{dst->NumSymbols()},
|
||||
dst_data_d{dst->gidx_buffer.DevicePointer()},
|
||||
src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
|
||||
offset(offset) {}
|
||||
|
||||
__device__ void operator()(size_t element_id) {
|
||||
cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset);
|
||||
cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id],
|
||||
element_id + offset);
|
||||
}
|
||||
};
|
||||
|
||||
// Copy the data from the given EllpackPage to the current page.
|
||||
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
|
||||
monitor_.StartCuda("Copy");
|
||||
size_t num_elements = page->matrix.n_rows * page->matrix.info.row_stride;
|
||||
CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
|
||||
CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
|
||||
CHECK_GE(matrix.n_rows * matrix.info.row_stride, offset + num_elements);
|
||||
size_t num_elements = page->n_rows * page->row_stride;
|
||||
CHECK_EQ(row_stride, page->row_stride);
|
||||
CHECK_EQ(NumSymbols(), page->NumSymbols());
|
||||
CHECK_GE(n_rows * row_stride, offset + num_elements);
|
||||
gidx_buffer.SetDevice(device);
|
||||
page->gidx_buffer.SetDevice(device);
|
||||
dh::LaunchN(device, num_elements, CopyPage(this, page, offset));
|
||||
monitor_.StopCuda("Copy");
|
||||
return num_elements;
|
||||
@@ -160,26 +155,29 @@ struct CompactPage {
|
||||
common::CompressedBufferWriter cbw;
|
||||
common::CompressedByteT* dst_data_d;
|
||||
common::CompressedIterator<uint32_t> src_iterator_d;
|
||||
/*! \brief An array that maps the rows from the full DMatrix to the compacted page.
|
||||
/*! \brief An array that maps the rows from the full DMatrix to the compacted
|
||||
* page.
|
||||
*
|
||||
* The total size is the number of rows in the original, uncompacted DMatrix. Elements are the
|
||||
* row ids in the compacted page. Rows not needed are set to SIZE_MAX.
|
||||
* The total size is the number of rows in the original, uncompacted DMatrix.
|
||||
* Elements are the row ids in the compacted page. Rows not needed are set to
|
||||
* SIZE_MAX.
|
||||
*
|
||||
* An example compacting 16 rows to 8 rows:
|
||||
* [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6, SIZE_MAX, 7, SIZE_MAX,
|
||||
* SIZE_MAX]
|
||||
* [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6,
|
||||
* SIZE_MAX, 7, SIZE_MAX, SIZE_MAX]
|
||||
*/
|
||||
common::Span<size_t> row_indexes;
|
||||
size_t base_rowid;
|
||||
size_t row_stride;
|
||||
|
||||
CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src, common::Span<size_t> row_indexes)
|
||||
: cbw{dst->matrix.info.NumSymbols()},
|
||||
dst_data_d{dst->gidx_buffer.data()},
|
||||
src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
|
||||
CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src,
|
||||
common::Span<size_t> row_indexes)
|
||||
: cbw{dst->NumSymbols()},
|
||||
dst_data_d{dst->gidx_buffer.DevicePointer()},
|
||||
src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
|
||||
row_indexes(row_indexes),
|
||||
base_rowid{src->matrix.base_rowid},
|
||||
row_stride{src->matrix.info.row_stride} {}
|
||||
base_rowid{src->base_rowid},
|
||||
row_stride{src->row_stride} {}
|
||||
|
||||
__device__ void operator()(size_t row_id) {
|
||||
size_t src_row = base_rowid + row_id;
|
||||
@@ -188,100 +186,72 @@ struct CompactPage {
|
||||
size_t dst_offset = dst_row * row_stride;
|
||||
size_t src_offset = row_id * row_stride;
|
||||
for (size_t j = 0; j < row_stride; j++) {
|
||||
cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j);
|
||||
cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j],
|
||||
dst_offset + j);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Compacts the data from the given EllpackPage into the current page.
|
||||
void EllpackPageImpl::Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes) {
|
||||
void EllpackPageImpl::Compact(int device, EllpackPageImpl* page,
|
||||
common::Span<size_t> row_indexes) {
|
||||
monitor_.StartCuda("Compact");
|
||||
CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
|
||||
CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
|
||||
CHECK_LE(page->matrix.base_rowid + page->matrix.n_rows, row_indexes.size());
|
||||
dh::LaunchN(device, page->matrix.n_rows, CompactPage(this, page, row_indexes));
|
||||
CHECK_EQ(row_stride, page->row_stride);
|
||||
CHECK_EQ(NumSymbols(), page->NumSymbols());
|
||||
CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
|
||||
gidx_buffer.SetDevice(device);
|
||||
page->gidx_buffer.SetDevice(device);
|
||||
dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes));
|
||||
monitor_.StopCuda("Compact");
|
||||
}
|
||||
|
||||
// Construct an EllpackInfo based on histogram cuts of features.
|
||||
EllpackInfo::EllpackInfo(int device,
|
||||
bool is_dense,
|
||||
size_t row_stride,
|
||||
const common::HistogramCuts& hmat,
|
||||
dh::BulkAllocator* ba)
|
||||
: is_dense(is_dense), row_stride(row_stride), n_bins(hmat.Ptrs().back()) {
|
||||
|
||||
ba->Allocate(device,
|
||||
&feature_segments, hmat.Ptrs().size(),
|
||||
&gidx_fvalue_map, hmat.Values().size(),
|
||||
&min_fvalue, hmat.MinValues().size());
|
||||
dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.Values());
|
||||
dh::CopyVectorToDeviceSpan(min_fvalue, hmat.MinValues());
|
||||
dh::CopyVectorToDeviceSpan(feature_segments, hmat.Ptrs());
|
||||
}
|
||||
|
||||
// Initialize the EllpackInfo for this page.
|
||||
void EllpackPageImpl::InitInfo(int device,
|
||||
bool is_dense,
|
||||
size_t row_stride,
|
||||
const common::HistogramCuts& hmat) {
|
||||
matrix.info = EllpackInfo(device, is_dense, row_stride, hmat, &ba_);
|
||||
}
|
||||
|
||||
// Initialize the buffer to stored compressed features.
|
||||
void EllpackPageImpl::InitCompressedData(int device, size_t num_rows) {
|
||||
size_t num_symbols = matrix.info.NumSymbols();
|
||||
void EllpackPageImpl::InitCompressedData(int device) {
|
||||
size_t num_symbols = NumSymbols();
|
||||
|
||||
// Required buffer size for storing data matrix in ELLPack format.
|
||||
size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
|
||||
matrix.info.row_stride * num_rows, num_symbols);
|
||||
ba_.Allocate(device, &gidx_buffer, compressed_size_bytes);
|
||||
|
||||
thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
|
||||
|
||||
matrix.gidx_iter = common::CompressedIterator<uint32_t>(gidx_buffer.data(), num_symbols);
|
||||
size_t compressed_size_bytes =
|
||||
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
|
||||
num_symbols);
|
||||
gidx_buffer.SetDevice(device);
|
||||
// Don't call fill unnecessarily
|
||||
if (gidx_buffer.Size() == 0) {
|
||||
gidx_buffer.Resize(compressed_size_bytes, 0);
|
||||
} else {
|
||||
gidx_buffer.Resize(compressed_size_bytes, 0);
|
||||
thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Compress a CSR page into ELLPACK.
|
||||
void EllpackPageImpl::CreateHistIndices(int device,
|
||||
const SparsePage& row_batch,
|
||||
const RowStateOnDevice& device_row_state) {
|
||||
// Has any been allocated for me in this batch?
|
||||
if (!device_row_state.rows_to_process_from_batch) return;
|
||||
|
||||
unsigned int null_gidx_value = matrix.info.n_bins;
|
||||
size_t row_stride = matrix.info.row_stride;
|
||||
const SparsePage& row_batch) {
|
||||
if (row_batch.Size() == 0) return;
|
||||
unsigned int null_gidx_value = NumSymbols() - 1;
|
||||
|
||||
const auto& offset_vec = row_batch.offset.ConstHostVector();
|
||||
|
||||
// bin and compress entries in batches of rows
|
||||
size_t gpu_batch_nrows = std::min(
|
||||
dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
|
||||
static_cast<size_t>(device_row_state.rows_to_process_from_batch));
|
||||
size_t gpu_batch_nrows =
|
||||
std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
|
||||
static_cast<size_t>(row_batch.Size()));
|
||||
const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
|
||||
|
||||
size_t gpu_nbatches = common::DivRoundUp(device_row_state.rows_to_process_from_batch,
|
||||
gpu_batch_nrows);
|
||||
size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);
|
||||
|
||||
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
|
||||
size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
|
||||
size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
|
||||
if (batch_row_end > device_row_state.rows_to_process_from_batch) {
|
||||
batch_row_end = device_row_state.rows_to_process_from_batch;
|
||||
}
|
||||
size_t batch_row_end =
|
||||
std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size());
|
||||
size_t batch_nrows = batch_row_end - batch_row_begin;
|
||||
|
||||
const auto ent_cnt_begin =
|
||||
offset_vec[device_row_state.row_offset_in_current_batch + batch_row_begin];
|
||||
const auto ent_cnt_end =
|
||||
offset_vec[device_row_state.row_offset_in_current_batch + batch_row_end];
|
||||
const auto ent_cnt_begin = offset_vec[batch_row_begin];
|
||||
const auto ent_cnt_end = offset_vec[batch_row_end];
|
||||
|
||||
/*! \brief row offset in SparsePage (the input data). */
|
||||
dh::device_vector<size_t> row_ptrs(batch_nrows + 1);
|
||||
thrust::copy(
|
||||
offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_begin,
|
||||
offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_end + 1,
|
||||
row_ptrs.begin());
|
||||
thrust::copy(offset_vec.data() + batch_row_begin,
|
||||
offset_vec.data() + batch_row_end + 1, row_ptrs.begin());
|
||||
|
||||
// number of entries in this batch.
|
||||
size_t n_entries = ent_cnt_end - ent_cnt_begin;
|
||||
@@ -289,97 +259,50 @@ void EllpackPageImpl::CreateHistIndices(int device,
|
||||
// copy data entries to device.
|
||||
dh::safe_cuda(cudaMemcpy(entries_d.data().get(),
|
||||
data_vec.data() + ent_cnt_begin,
|
||||
n_entries * sizeof(Entry),
|
||||
cudaMemcpyDefault));
|
||||
n_entries * sizeof(Entry), cudaMemcpyDefault));
|
||||
const dim3 block3(32, 8, 1); // 256 threads
|
||||
const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
|
||||
common::DivRoundUp(row_stride, block3.y),
|
||||
1);
|
||||
dh::LaunchKernel {grid3, block3} (
|
||||
CompressBinEllpackKernel,
|
||||
common::CompressedBufferWriter(matrix.info.NumSymbols()),
|
||||
gidx_buffer.data(),
|
||||
row_ptrs.data().get(),
|
||||
entries_d.data().get(),
|
||||
matrix.info.gidx_fvalue_map.data(),
|
||||
matrix.info.feature_segments.data(),
|
||||
device_row_state.total_rows_processed + batch_row_begin,
|
||||
batch_nrows,
|
||||
row_stride,
|
||||
common::DivRoundUp(row_stride, block3.y), 1);
|
||||
auto device_accessor = GetDeviceAccessor(device);
|
||||
dh::LaunchKernel {grid3, block3}(
|
||||
CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()),
|
||||
gidx_buffer.DevicePointer(), row_ptrs.data().get(),
|
||||
entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
|
||||
device_accessor.feature_segments.data(),
|
||||
row_batch.base_rowid + batch_row_begin, batch_nrows, row_stride,
|
||||
null_gidx_value);
|
||||
}
|
||||
}
|
||||
|
||||
// Return the number of rows contained in this page.
|
||||
size_t EllpackPageImpl::Size() const {
|
||||
return matrix.n_rows;
|
||||
}
|
||||
|
||||
// Clear the current page.
|
||||
void EllpackPageImpl::Clear() {
|
||||
ba_.Clear();
|
||||
gidx_buffer = {};
|
||||
idx_buffer.clear();
|
||||
sparse_page_.Clear();
|
||||
matrix.base_rowid = 0;
|
||||
matrix.n_rows = 0;
|
||||
device_initialized_ = false;
|
||||
}
|
||||
|
||||
// Push a CSR page to the current page.
|
||||
//
|
||||
// The CSR pages are accumulated in memory until they reach a certain size, then written out as
|
||||
// compressed ELLPACK.
|
||||
void EllpackPageImpl::Push(int device, const SparsePage& batch) {
|
||||
sparse_page_.Push(batch);
|
||||
matrix.n_rows += batch.Size();
|
||||
}
|
||||
|
||||
// Compress the accumulated SparsePage.
|
||||
void EllpackPageImpl::CompressSparsePage(int device) {
|
||||
monitor_.StartCuda("InitCompressedData");
|
||||
InitCompressedData(device, matrix.n_rows);
|
||||
monitor_.StopCuda("InitCompressedData");
|
||||
|
||||
monitor_.StartCuda("BinningCompression");
|
||||
DeviceHistogramBuilderState hist_builder_row_state(matrix.n_rows);
|
||||
hist_builder_row_state.BeginBatch(sparse_page_);
|
||||
CreateHistIndices(device, sparse_page_, hist_builder_row_state.GetRowStateOnDevice());
|
||||
hist_builder_row_state.EndBatch();
|
||||
monitor_.StopCuda("BinningCompression");
|
||||
|
||||
monitor_.StartCuda("CopyDeviceToHost");
|
||||
idx_buffer.resize(gidx_buffer.size());
|
||||
dh::CopyDeviceSpanToVector(&idx_buffer, gidx_buffer);
|
||||
ba_.Clear();
|
||||
gidx_buffer = {};
|
||||
monitor_.StopCuda("CopyDeviceToHost");
|
||||
}
|
||||
size_t EllpackPageImpl::Size() const { return n_rows; }
|
||||
|
||||
// Return the memory cost for storing the compressed features.
|
||||
size_t EllpackPageImpl::MemCostBytes() const {
|
||||
// Required buffer size for storing data matrix in ELLPack format.
|
||||
size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
|
||||
matrix.info.row_stride * matrix.n_rows, matrix.info.NumSymbols());
|
||||
size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
|
||||
const common::HistogramCuts& cuts) {
|
||||
// Required buffer size for storing data matrix in EtoLLPack format.
|
||||
size_t compressed_size_bytes =
|
||||
common::CompressedBufferWriter::CalculateBufferSize(row_stride * num_rows,
|
||||
cuts.TotalBins() + 1);
|
||||
return compressed_size_bytes;
|
||||
}
|
||||
|
||||
// Copy the compressed features to GPU.
|
||||
void EllpackPageImpl::InitDevice(int device, EllpackInfo info) {
|
||||
if (device_initialized_) return;
|
||||
EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(int device) const {
|
||||
gidx_buffer.SetDevice(device);
|
||||
return EllpackDeviceAccessor(
|
||||
device, cuts_, is_dense, row_stride, base_rowid, n_rows,
|
||||
common::CompressedIterator<uint32_t>(gidx_buffer.ConstDevicePointer(),
|
||||
NumSymbols()));
|
||||
}
|
||||
|
||||
monitor_.StartCuda("CopyPageToDevice");
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
|
||||
gidx_buffer = {};
|
||||
ba_.Allocate(device, &gidx_buffer, idx_buffer.size());
|
||||
dh::CopyVectorToDeviceSpan(gidx_buffer, idx_buffer);
|
||||
|
||||
matrix.info = info;
|
||||
matrix.gidx_iter = common::CompressedIterator<uint32_t>(gidx_buffer.data(), info.n_bins + 1);
|
||||
|
||||
monitor_.StopCuda("CopyPageToDevice");
|
||||
|
||||
device_initialized_ = true;
|
||||
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
|
||||
const SparsePage& page, bool is_dense,
|
||||
size_t row_stride)
|
||||
: cuts_(std::move(cuts)),
|
||||
is_dense(is_dense),
|
||||
n_rows(page.Size()),
|
||||
row_stride(row_stride) {
|
||||
this->InitCompressedData(device);
|
||||
this->CreateHistIndices(device, page);
|
||||
}
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -40,71 +40,53 @@ __forceinline__ __device__ int BinarySearchRow(
|
||||
return -1;
|
||||
}
|
||||
|
||||
/** \brief Meta information about the ELLPACK matrix. */
|
||||
struct EllpackInfo {
|
||||
/** \brief Struct for accessing and manipulating an ellpack matrix on the
|
||||
* device. Does not own underlying memory and may be trivially copied into
|
||||
* kernels.*/
|
||||
struct EllpackDeviceAccessor {
|
||||
/*! \brief Whether or not if the matrix is dense. */
|
||||
bool is_dense;
|
||||
/*! \brief Row length for ELLPack, equal to number of features. */
|
||||
size_t row_stride;
|
||||
/*! \brief Total number of bins, also used as the null index value, . */
|
||||
size_t n_bins;
|
||||
/*! \brief Minimum value for each feature. Size equals to number of features. */
|
||||
common::Span<bst_float> min_fvalue;
|
||||
/*! \brief Histogram cut pointers. Size equals to (number of features + 1). */
|
||||
common::Span<uint32_t> feature_segments;
|
||||
/*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */
|
||||
common::Span<bst_float> gidx_fvalue_map;
|
||||
|
||||
EllpackInfo() = default;
|
||||
|
||||
/*!
|
||||
* \brief Constructor.
|
||||
*
|
||||
* @param device The GPU device to use.
|
||||
* @param is_dense Whether the matrix is dense.
|
||||
* @param row_stride The number of features between starts of consecutive rows.
|
||||
* @param hmat The histogram cuts of all the features.
|
||||
* @param ba The BulkAllocator that owns the GPU memory.
|
||||
*/
|
||||
explicit EllpackInfo(int device,
|
||||
bool is_dense,
|
||||
size_t row_stride,
|
||||
const common::HistogramCuts& hmat,
|
||||
dh::BulkAllocator* ba);
|
||||
|
||||
/*! \brief Return the total number of symbols (total number of bins plus 1 for not found). */
|
||||
size_t NumSymbols() const {
|
||||
return n_bins + 1;
|
||||
}
|
||||
size_t NumFeatures() const {
|
||||
return min_fvalue.size();
|
||||
}
|
||||
};
|
||||
|
||||
/** \brief Struct for accessing and manipulating an ellpack matrix on the
|
||||
* device. Does not own underlying memory and may be trivially copied into
|
||||
* kernels.*/
|
||||
struct EllpackMatrix {
|
||||
EllpackInfo info;
|
||||
size_t base_rowid{};
|
||||
size_t n_rows{};
|
||||
common::CompressedIterator<uint32_t> gidx_iter;
|
||||
/*! \brief Minimum value for each feature. Size equals to number of features. */
|
||||
common::Span<const bst_float> min_fvalue;
|
||||
/*! \brief Histogram cut pointers. Size equals to (number of features + 1). */
|
||||
common::Span<const uint32_t> feature_segments;
|
||||
/*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */
|
||||
common::Span<const bst_float> gidx_fvalue_map;
|
||||
|
||||
EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts,
|
||||
bool is_dense, size_t row_stride, size_t base_rowid,
|
||||
size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter)
|
||||
: is_dense(is_dense),
|
||||
row_stride(row_stride),
|
||||
base_rowid(base_rowid),
|
||||
n_rows(n_rows) ,gidx_iter(gidx_iter){
|
||||
cuts.cut_values_.SetDevice(device);
|
||||
cuts.cut_ptrs_.SetDevice(device);
|
||||
cuts.min_vals_.SetDevice(device);
|
||||
gidx_fvalue_map = cuts.cut_values_.ConstDeviceSpan();
|
||||
feature_segments = cuts.cut_ptrs_.ConstDeviceSpan();
|
||||
min_fvalue = cuts.min_vals_.ConstDeviceSpan();
|
||||
}
|
||||
// Get a matrix element, uses binary search for look up Return NaN if missing
|
||||
// Given a row index and a feature index, returns the corresponding cut value
|
||||
__device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
|
||||
ridx -= base_rowid;
|
||||
auto row_begin = info.row_stride * ridx;
|
||||
auto row_end = row_begin + info.row_stride;
|
||||
auto row_begin = row_stride * ridx;
|
||||
auto row_end = row_begin + row_stride;
|
||||
auto gidx = -1;
|
||||
if (info.is_dense) {
|
||||
if (is_dense) {
|
||||
gidx = gidx_iter[row_begin + fidx];
|
||||
} else {
|
||||
gidx = BinarySearchRow(row_begin,
|
||||
row_end,
|
||||
gidx_iter,
|
||||
info.feature_segments[fidx],
|
||||
info.feature_segments[fidx + 1]);
|
||||
feature_segments[fidx],
|
||||
feature_segments[fidx + 1]);
|
||||
}
|
||||
return gidx;
|
||||
}
|
||||
@@ -113,97 +95,27 @@ struct EllpackMatrix {
|
||||
if (gidx == -1) {
|
||||
return nan("");
|
||||
}
|
||||
return info.gidx_fvalue_map[gidx];
|
||||
return gidx_fvalue_map[gidx];
|
||||
}
|
||||
|
||||
// Check if the row id is withing range of the current batch.
|
||||
__device__ bool IsInRange(size_t row_id) const {
|
||||
return row_id >= base_rowid && row_id < base_rowid + n_rows;
|
||||
}
|
||||
/*! \brief Return the total number of symbols (total number of bins plus 1 for
|
||||
* not found). */
|
||||
size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
|
||||
|
||||
size_t NullValue() const { return gidx_fvalue_map.size(); }
|
||||
|
||||
XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
|
||||
|
||||
XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
|
||||
};
|
||||
|
||||
// Instances of this type are created while creating the histogram bins for the
|
||||
// entire dataset across multiple sparse page batches. This keeps track of the number
|
||||
// of rows to process from a batch and the position from which to process on each device.
|
||||
struct RowStateOnDevice {
|
||||
// Number of rows assigned to this device
|
||||
size_t total_rows_assigned_to_device;
|
||||
// Number of rows processed thus far
|
||||
size_t total_rows_processed;
|
||||
// Number of rows to process from the current sparse page batch
|
||||
size_t rows_to_process_from_batch;
|
||||
// Offset from the current sparse page batch to begin processing
|
||||
size_t row_offset_in_current_batch;
|
||||
|
||||
explicit RowStateOnDevice(size_t total_rows)
|
||||
: total_rows_assigned_to_device(total_rows), total_rows_processed(0),
|
||||
rows_to_process_from_batch(0), row_offset_in_current_batch(0) {
|
||||
}
|
||||
|
||||
explicit RowStateOnDevice(size_t total_rows, size_t batch_rows)
|
||||
: total_rows_assigned_to_device(total_rows), total_rows_processed(0),
|
||||
rows_to_process_from_batch(batch_rows), row_offset_in_current_batch(0) {
|
||||
}
|
||||
|
||||
// Advance the row state by the number of rows processed
|
||||
void Advance() {
|
||||
total_rows_processed += rows_to_process_from_batch;
|
||||
CHECK_LE(total_rows_processed, total_rows_assigned_to_device);
|
||||
rows_to_process_from_batch = row_offset_in_current_batch = 0;
|
||||
}
|
||||
};
|
||||
|
||||
// An instance of this type is created which keeps track of total number of rows to process,
|
||||
// rows processed thus far, rows to process and the offset from the current sparse page batch
|
||||
// to begin processing on each device
|
||||
class DeviceHistogramBuilderState {
|
||||
public:
|
||||
explicit DeviceHistogramBuilderState(size_t n_rows) : device_row_state_(n_rows) {}
|
||||
|
||||
const RowStateOnDevice& GetRowStateOnDevice() const {
|
||||
return device_row_state_;
|
||||
}
|
||||
|
||||
// This method is invoked at the beginning of each sparse page batch. This distributes
|
||||
// the rows in the sparse page to the device.
|
||||
// TODO(sriramch): Think of a way to utilize *all* the GPUs to build the compressed bins.
|
||||
void BeginBatch(const SparsePage &batch) {
|
||||
size_t rem_rows = batch.Size();
|
||||
size_t row_offset_in_current_batch = 0;
|
||||
|
||||
// Do we have anymore left to process from this batch on this device?
|
||||
if (device_row_state_.total_rows_assigned_to_device > device_row_state_.total_rows_processed) {
|
||||
// There are still some rows that needs to be assigned to this device
|
||||
device_row_state_.rows_to_process_from_batch =
|
||||
std::min(
|
||||
device_row_state_.total_rows_assigned_to_device - device_row_state_.total_rows_processed,
|
||||
rem_rows);
|
||||
} else {
|
||||
// All rows have been assigned to this device
|
||||
device_row_state_.rows_to_process_from_batch = 0;
|
||||
}
|
||||
|
||||
device_row_state_.row_offset_in_current_batch = row_offset_in_current_batch;
|
||||
row_offset_in_current_batch += device_row_state_.rows_to_process_from_batch;
|
||||
rem_rows -= device_row_state_.rows_to_process_from_batch;
|
||||
}
|
||||
|
||||
// This method is invoked after completion of each sparse page batch
|
||||
void EndBatch() {
|
||||
device_row_state_.Advance();
|
||||
}
|
||||
|
||||
private:
|
||||
RowStateOnDevice device_row_state_{0};
|
||||
};
|
||||
|
||||
class EllpackPageImpl {
|
||||
public:
|
||||
EllpackMatrix matrix;
|
||||
/*! \brief global index of histogram, which is stored in ELLPack format. */
|
||||
common::Span<common::CompressedByteT> gidx_buffer;
|
||||
std::vector<common::CompressedByteT> idx_buffer;
|
||||
|
||||
/*!
|
||||
* \brief Default constructor.
|
||||
*
|
||||
@@ -218,7 +130,12 @@ class EllpackPageImpl {
|
||||
* This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
|
||||
* and the given number of rows.
|
||||
*/
|
||||
explicit EllpackPageImpl(int device, EllpackInfo info, size_t n_rows);
|
||||
EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense,
|
||||
size_t row_stride, size_t n_rows);
|
||||
|
||||
EllpackPageImpl(int device, common::HistogramCuts cuts,
|
||||
const SparsePage& page,
|
||||
bool is_dense,size_t row_stride);
|
||||
|
||||
/*!
|
||||
* \brief Constructor from an existing DMatrix.
|
||||
@@ -245,77 +162,53 @@ class EllpackPageImpl {
|
||||
*/
|
||||
void Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes);
|
||||
|
||||
/*!
|
||||
* \brief Initialize the EllpackInfo contained in the EllpackMatrix.
|
||||
*
|
||||
* This is used in the in-memory case. The current page owns the BulkAllocator, which in turn owns
|
||||
* the GPU memory used by the EllpackInfo.
|
||||
*
|
||||
* @param device The GPU device to use.
|
||||
* @param is_dense Whether the matrix is dense.
|
||||
* @param row_stride The number of features between starts of consecutive rows.
|
||||
* @param hmat The histogram cuts of all the features.
|
||||
*/
|
||||
void InitInfo(int device, bool is_dense, size_t row_stride, const common::HistogramCuts& hmat);
|
||||
|
||||
/*!
|
||||
* \brief Initialize the buffer to store compressed features.
|
||||
*
|
||||
* @param device The GPU device to use.
|
||||
* @param num_rows The number of rows we are storing in the buffer.
|
||||
*/
|
||||
void InitCompressedData(int device, size_t num_rows);
|
||||
|
||||
/*!
|
||||
* \brief Compress a single page of CSR data into ELLPACK.
|
||||
*
|
||||
* @param device The GPU device to use.
|
||||
* @param row_batch The CSR page.
|
||||
* @param device_row_state On-device data for maintaining state.
|
||||
*/
|
||||
void CreateHistIndices(int device,
|
||||
const SparsePage& row_batch,
|
||||
const RowStateOnDevice& device_row_state);
|
||||
|
||||
/*! \return Number of instances in the page. */
|
||||
size_t Size() const;
|
||||
|
||||
/*! \brief Set the base row id for this page. */
|
||||
inline void SetBaseRowId(size_t row_id) {
|
||||
matrix.base_rowid = row_id;
|
||||
void SetBaseRowId(size_t row_id) {
|
||||
base_rowid = row_id;
|
||||
}
|
||||
|
||||
/*! \brief clear the page. */
|
||||
void Clear();
|
||||
|
||||
/*!
|
||||
* \brief Push a sparse page.
|
||||
* \param batch The row page.
|
||||
*/
|
||||
void Push(int device, const SparsePage& batch);
|
||||
|
||||
/*! \return Estimation of memory cost of this page. */
|
||||
size_t MemCostBytes() const;
|
||||
static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;
|
||||
|
||||
/*!
|
||||
* \brief Copy the ELLPACK matrix to GPU.
|
||||
*
|
||||
* @param device The GPU device to use.
|
||||
* @param info The EllpackInfo for the matrix.
|
||||
*/
|
||||
void InitDevice(int device, EllpackInfo info);
|
||||
|
||||
/*! \brief Compress the accumulated SparsePage into ELLPACK format.
|
||||
*
|
||||
* @param device The GPU device to use.
|
||||
*/
|
||||
void CompressSparsePage(int device);
|
||||
/*! \brief Return the total number of symbols (total number of bins plus 1 for
|
||||
* not found). */
|
||||
size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
|
||||
|
||||
EllpackDeviceAccessor GetDeviceAccessor(int device) const;
|
||||
|
||||
private:
|
||||
/*!
|
||||
* \brief Compress a single page of CSR data into ELLPACK.
|
||||
*
|
||||
* @param device The GPU device to use.
|
||||
* @param row_batch The CSR page.
|
||||
*/
|
||||
void CreateHistIndices(int device,
|
||||
const SparsePage& row_batch
|
||||
);
|
||||
/*!
|
||||
* \brief Initialize the buffer to store compressed features.
|
||||
*/
|
||||
void InitCompressedData(int device);
|
||||
|
||||
|
||||
public:
|
||||
/*! \brief Whether or not if the matrix is dense. */
|
||||
bool is_dense;
|
||||
/*! \brief Row length for ELLPack. */
|
||||
size_t row_stride;
|
||||
size_t base_rowid{0};
|
||||
size_t n_rows{};
|
||||
/*! \brief global index of histogram, which is stored in ELLPack format. */
|
||||
HostDeviceVector<common::CompressedByteT> gidx_buffer;
|
||||
common::HistogramCuts cuts_;
|
||||
private:
|
||||
common::Monitor monitor_;
|
||||
dh::BulkAllocator ba_;
|
||||
bool device_initialized_{false};
|
||||
SparsePage sparse_page_{};
|
||||
};
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -17,26 +17,35 @@ class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
|
||||
public:
|
||||
bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
|
||||
auto* impl = page->Impl();
|
||||
impl->Clear();
|
||||
if (!fi->Read(&impl->matrix.n_rows)) return false;
|
||||
return fi->Read(&impl->idx_buffer);
|
||||
fi->Read(&impl->cuts_.cut_values_.HostVector());
|
||||
fi->Read(&impl->cuts_.cut_ptrs_.HostVector());
|
||||
fi->Read(&impl->cuts_.min_vals_.HostVector());
|
||||
fi->Read(&impl->n_rows);
|
||||
fi->Read(&impl->is_dense);
|
||||
fi->Read(&impl->row_stride);
|
||||
if (!fi->Read(&impl->gidx_buffer.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Read(EllpackPage* page,
|
||||
dmlc::SeekStream* fi,
|
||||
const std::vector<bst_uint>& sorted_index_set) override {
|
||||
auto* impl = page->Impl();
|
||||
impl->Clear();
|
||||
if (!fi->Read(&impl->matrix.n_rows)) return false;
|
||||
return fi->Read(&page->Impl()->idx_buffer);
|
||||
LOG(FATAL) << "Not implemented";
|
||||
return false;
|
||||
}
|
||||
|
||||
void Write(const EllpackPage& page, dmlc::Stream* fo) override {
|
||||
auto* impl = page.Impl();
|
||||
fo->Write(impl->matrix.n_rows);
|
||||
auto buffer = impl->idx_buffer;
|
||||
CHECK(!buffer.empty());
|
||||
fo->Write(buffer);
|
||||
fo->Write(impl->cuts_.cut_values_.ConstHostVector());
|
||||
fo->Write(impl->cuts_.cut_ptrs_.ConstHostVector());
|
||||
fo->Write(impl->cuts_.min_vals_.ConstHostVector());
|
||||
fo->Write(impl->n_rows);
|
||||
fo->Write(impl->is_dense);
|
||||
fo->Write(impl->row_stride);
|
||||
CHECK(!impl->gidx_buffer.ConstHostVector().empty());
|
||||
fo->Write(impl->gidx_buffer.HostVector());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -2,45 +2,23 @@
|
||||
* Copyright 2019 XGBoost contributors
|
||||
*/
|
||||
#ifndef XGBOOST_USE_CUDA
|
||||
#include <dmlc/base.h>
|
||||
#if DMLC_ENABLE_STD_THREAD
|
||||
|
||||
#include <xgboost/data.h>
|
||||
#include "ellpack_page_source.h"
|
||||
#include <xgboost/data.h>
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
||||
const std::string& cache_info,
|
||||
const BatchParam& param) noexcept(false) {
|
||||
LOG(FATAL) << "Internal Error: "
|
||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
||||
}
|
||||
|
||||
void EllpackPageSource::BeforeFirst() {
|
||||
LOG(FATAL) << "Internal Error: "
|
||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
||||
}
|
||||
|
||||
bool EllpackPageSource::Next() {
|
||||
LOG(FATAL) << "Internal Error: "
|
||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
||||
return false;
|
||||
}
|
||||
|
||||
EllpackPage& EllpackPageSource::Value() {
|
||||
LOG(FATAL) << "Internal Error: "
|
||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
||||
EllpackPage* page { nullptr };
|
||||
return *page;
|
||||
}
|
||||
|
||||
const EllpackPage& EllpackPageSource::Value() const {
|
||||
LOG(FATAL) << "Internal Error: "
|
||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
||||
EllpackPage* page { nullptr };
|
||||
return *page;
|
||||
LOG(FATAL)
|
||||
<< "Internal Error: "
|
||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
||||
}
|
||||
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // DMLC_ENABLE_STD_THREAD
|
||||
#endif // XGBOOST_USE_CUDA
|
||||
|
||||
@@ -3,73 +3,16 @@
|
||||
*/
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "../common/hist_util.h"
|
||||
|
||||
#include "ellpack_page.cuh"
|
||||
#include "ellpack_page_source.h"
|
||||
#include "sparse_page_source.h"
|
||||
#include "ellpack_page.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
class EllpackPageSourceImpl : public DataSource<EllpackPage> {
|
||||
public:
|
||||
/*!
|
||||
* \brief Create source from cache files the cache_prefix.
|
||||
* \param cache_prefix The prefix of cache we want to solve.
|
||||
*/
|
||||
explicit EllpackPageSourceImpl(DMatrix* dmat,
|
||||
const std::string& cache_info,
|
||||
const BatchParam& param) noexcept(false);
|
||||
|
||||
/*! \brief destructor */
|
||||
~EllpackPageSourceImpl() override = default;
|
||||
|
||||
void BeforeFirst() override;
|
||||
bool Next() override;
|
||||
EllpackPage& Value();
|
||||
const EllpackPage& Value() const override;
|
||||
|
||||
private:
|
||||
/*! \brief Write Ellpack pages after accumulating them in memory. */
|
||||
void WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const;
|
||||
|
||||
/*! \brief The page type string for ELLPACK. */
|
||||
const std::string kPageType_{".ellpack.page"};
|
||||
|
||||
int device_{-1};
|
||||
size_t page_size_{DMatrix::kPageSize};
|
||||
common::Monitor monitor_;
|
||||
dh::BulkAllocator ba_;
|
||||
/*! \brief The EllpackInfo, with the underlying GPU memory shared by all pages. */
|
||||
EllpackInfo ellpack_info_;
|
||||
std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> source_;
|
||||
std::string cache_info_;
|
||||
};
|
||||
|
||||
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
||||
const std::string& cache_info,
|
||||
const BatchParam& param) noexcept(false)
|
||||
: impl_{new EllpackPageSourceImpl(dmat, cache_info, param)} {}
|
||||
|
||||
void EllpackPageSource::BeforeFirst() {
|
||||
impl_->BeforeFirst();
|
||||
}
|
||||
|
||||
bool EllpackPageSource::Next() {
|
||||
return impl_->Next();
|
||||
}
|
||||
|
||||
EllpackPage& EllpackPageSource::Value() {
|
||||
return impl_->Value();
|
||||
}
|
||||
|
||||
const EllpackPage& EllpackPageSource::Value() const {
|
||||
return impl_->Value();
|
||||
}
|
||||
|
||||
size_t GetRowStride(DMatrix* dmat) {
|
||||
if (dmat->IsDense()) return dmat->Info().num_col_;
|
||||
|
||||
@@ -86,17 +29,19 @@ size_t GetRowStride(DMatrix* dmat) {
|
||||
|
||||
// Build the quantile sketch across the whole input data, then use the histogram cuts to compress
|
||||
// each CSR page, and write the accumulated ELLPACK pages to disk.
|
||||
EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
|
||||
const std::string& cache_info,
|
||||
const BatchParam& param) noexcept(false)
|
||||
: device_(param.gpu_id), cache_info_(cache_info) {
|
||||
|
||||
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
||||
const std::string& cache_info,
|
||||
const BatchParam& param) noexcept(false) {
|
||||
cache_info_ = ParseCacheInfo(cache_info, kPageType_);
|
||||
for (auto file : cache_info_.name_shards) {
|
||||
CheckCacheFileExists(file);
|
||||
}
|
||||
if (param.gpu_page_size > 0) {
|
||||
page_size_ = param.gpu_page_size;
|
||||
}
|
||||
|
||||
monitor_.Init("ellpack_page_source");
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(param.gpu_id));
|
||||
|
||||
monitor_.StartCuda("Quantiles");
|
||||
size_t row_stride = GetRowStride(dmat);
|
||||
@@ -104,75 +49,52 @@ EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
|
||||
param.gpu_batch_nrows);
|
||||
monitor_.StopCuda("Quantiles");
|
||||
|
||||
monitor_.StartCuda("CreateEllpackInfo");
|
||||
ellpack_info_ = EllpackInfo(device_, dmat->IsDense(), row_stride, cuts, &ba_);
|
||||
monitor_.StopCuda("CreateEllpackInfo");
|
||||
|
||||
monitor_.StartCuda("WriteEllpackPages");
|
||||
WriteEllpackPages(dmat, cache_info);
|
||||
WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride);
|
||||
monitor_.StopCuda("WriteEllpackPages");
|
||||
|
||||
source_.reset(new ExternalMemoryPrefetcher<EllpackPage>(
|
||||
ParseCacheInfo(cache_info_, kPageType_)));
|
||||
}
|
||||
|
||||
void EllpackPageSourceImpl::BeforeFirst() {
|
||||
source_.reset(new ExternalMemoryPrefetcher<EllpackPage>(
|
||||
ParseCacheInfo(cache_info_, kPageType_)));
|
||||
source_->BeforeFirst();
|
||||
}
|
||||
|
||||
bool EllpackPageSourceImpl::Next() {
|
||||
return source_->Next();
|
||||
}
|
||||
|
||||
EllpackPage& EllpackPageSourceImpl::Value() {
|
||||
EllpackPage& page = source_->Value();
|
||||
page.Impl()->InitDevice(device_, ellpack_info_);
|
||||
return page;
|
||||
}
|
||||
|
||||
const EllpackPage& EllpackPageSourceImpl::Value() const {
|
||||
EllpackPage& page = source_->Value();
|
||||
page.Impl()->InitDevice(device_, ellpack_info_);
|
||||
return page;
|
||||
external_prefetcher_.reset(
|
||||
new ExternalMemoryPrefetcher<EllpackPage>(cache_info_));
|
||||
}
|
||||
|
||||
// Compress each CSR page to ELLPACK, and write the accumulated pages to disk.
|
||||
void EllpackPageSourceImpl::WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const {
|
||||
void EllpackPageSource::WriteEllpackPages(int device, DMatrix* dmat,
|
||||
const common::HistogramCuts& cuts,
|
||||
const std::string& cache_info,
|
||||
size_t row_stride) const {
|
||||
auto cinfo = ParseCacheInfo(cache_info, kPageType_);
|
||||
const size_t extra_buffer_capacity = 6;
|
||||
SparsePageWriter<EllpackPage> writer(
|
||||
cinfo.name_shards, cinfo.format_shards, extra_buffer_capacity);
|
||||
SparsePageWriter<EllpackPage> writer(cinfo.name_shards, cinfo.format_shards,
|
||||
extra_buffer_capacity);
|
||||
std::shared_ptr<EllpackPage> page;
|
||||
SparsePage temp_host_page;
|
||||
writer.Alloc(&page);
|
||||
auto* impl = page->Impl();
|
||||
impl->matrix.info = ellpack_info_;
|
||||
impl->Clear();
|
||||
|
||||
const MetaInfo& info = dmat->Info();
|
||||
size_t bytes_write = 0;
|
||||
double tstart = dmlc::GetTime();
|
||||
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
|
||||
impl->Push(device_, batch);
|
||||
temp_host_page.Push(batch);
|
||||
|
||||
size_t mem_cost_bytes = impl->MemCostBytes();
|
||||
size_t mem_cost_bytes =
|
||||
EllpackPageImpl::MemCostBytes(temp_host_page.Size(), row_stride, cuts);
|
||||
if (mem_cost_bytes >= page_size_) {
|
||||
bytes_write += mem_cost_bytes;
|
||||
impl->CompressSparsePage(device_);
|
||||
*impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
|
||||
row_stride);
|
||||
writer.PushWrite(std::move(page));
|
||||
writer.Alloc(&page);
|
||||
impl = page->Impl();
|
||||
impl->matrix.info = ellpack_info_;
|
||||
impl->Clear();
|
||||
temp_host_page.Clear();
|
||||
double tdiff = dmlc::GetTime() - tstart;
|
||||
LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in "
|
||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||
<< (bytes_write >> 20UL) << " written";
|
||||
}
|
||||
}
|
||||
if (impl->Size() != 0) {
|
||||
impl->CompressSparsePage(device_);
|
||||
if (temp_host_page.Size() != 0) {
|
||||
*impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
|
||||
row_stride);
|
||||
writer.PushWrite(std::move(page));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,19 +10,17 @@
|
||||
#include <string>
|
||||
|
||||
#include "../common/timer.h"
|
||||
#include "../common/hist_util.h"
|
||||
#include "sparse_page_source.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
class EllpackPageSourceImpl;
|
||||
|
||||
/*!
|
||||
* \brief External memory data source for ELLPACK format.
|
||||
*
|
||||
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
|
||||
* including CUDA-specific implementation details in the header.
|
||||
*/
|
||||
class EllpackPageSource : public DataSource<EllpackPage> {
|
||||
class EllpackPageSource {
|
||||
public:
|
||||
/*!
|
||||
* \brief Create source from cache files the cache_prefix.
|
||||
@@ -32,19 +30,33 @@ class EllpackPageSource : public DataSource<EllpackPage> {
|
||||
const std::string& cache_info,
|
||||
const BatchParam& param) noexcept(false);
|
||||
|
||||
/*! \brief destructor */
|
||||
~EllpackPageSource() override = default;
|
||||
BatchSet<EllpackPage> GetBatchSet() {
|
||||
auto begin_iter = BatchIterator<EllpackPage>(
|
||||
new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<EllpackPage>,
|
||||
EllpackPage>(external_prefetcher_.get()));
|
||||
return BatchSet<EllpackPage>(begin_iter);
|
||||
}
|
||||
|
||||
void BeforeFirst() override;
|
||||
bool Next() override;
|
||||
EllpackPage& Value();
|
||||
const EllpackPage& Value() const override;
|
||||
|
||||
const EllpackPageSourceImpl* Impl() const { return impl_.get(); }
|
||||
EllpackPageSourceImpl* Impl() { return impl_.get(); }
|
||||
~EllpackPageSource() {
|
||||
external_prefetcher_.reset();
|
||||
for (auto file : cache_info_.name_shards) {
|
||||
TryDeleteCacheFile(file);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<EllpackPageSourceImpl> impl_;
|
||||
void WriteEllpackPages(int device, DMatrix* dmat,
|
||||
const common::HistogramCuts& cuts,
|
||||
const std::string& cache_info,
|
||||
size_t row_stride) const;
|
||||
|
||||
/*! \brief The page type string for ELLPACK. */
|
||||
const std::string kPageType_{".ellpack.page"};
|
||||
|
||||
size_t page_size_{DMatrix::kPageSize};
|
||||
common::Monitor monitor_;
|
||||
std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> external_prefetcher_;
|
||||
CacheInfo cache_info_;
|
||||
};
|
||||
|
||||
} // namespace data
|
||||
|
||||
@@ -51,11 +51,7 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& par
|
||||
ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param));
|
||||
batch_param_ = param;
|
||||
}
|
||||
ellpack_source_->BeforeFirst();
|
||||
ellpack_source_->Next();
|
||||
auto begin_iter = BatchIterator<EllpackPage>(
|
||||
new SparseBatchIteratorImpl<EllpackPageSource, EllpackPage>(ellpack_source_.get()));
|
||||
return BatchSet<EllpackPage>(begin_iter);
|
||||
return ellpack_source_->GetBatchSet();
|
||||
}
|
||||
|
||||
} // namespace data
|
||||
|
||||
Reference in New Issue
Block a user