Support multiple batches in gpu_hist (#5014)
* Initial external memory training support for GPU Hist tree method.
This commit is contained in:
@@ -69,6 +69,8 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) {
|
||||
monitor_.Init("ellpack_page");
|
||||
dh::safe_cuda(cudaSetDevice(param.gpu_id));
|
||||
|
||||
matrix.n_rows = dmat->Info().num_row_;
|
||||
|
||||
monitor_.StartCuda("Quantiles");
|
||||
// Create the quantile sketches for the dmatrix and initialize HistogramCuts.
|
||||
common::HistogramCuts hmat;
|
||||
@@ -206,7 +208,7 @@ void EllpackPageImpl::CreateHistIndices(int device,
|
||||
|
||||
// Return the number of rows contained in this page.
|
||||
size_t EllpackPageImpl::Size() const {
|
||||
return n_rows;
|
||||
return matrix.n_rows;
|
||||
}
|
||||
|
||||
// Clear the current page.
|
||||
@@ -214,44 +216,50 @@ void EllpackPageImpl::Clear() {
|
||||
ba_.Clear();
|
||||
gidx_buffer = {};
|
||||
idx_buffer.clear();
|
||||
n_rows = 0;
|
||||
sparse_page_.Clear();
|
||||
matrix.base_rowid = 0;
|
||||
matrix.n_rows = 0;
|
||||
device_initialized_ = false;
|
||||
}
|
||||
|
||||
// Push a CSR page to the current page.
|
||||
//
|
||||
// First compress the CSR page into ELLPACK, then the compressed buffer is copied to host and
|
||||
// appended to the existing host vector.
|
||||
// The CSR pages are accumulated in memory until they reach a certain size, then written out as
|
||||
// compressed ELLPACK.
|
||||
void EllpackPageImpl::Push(int device, const SparsePage& batch) {
  // Accumulate the incoming CSR page; `device` is not referenced in this step.
  sparse_page_.Push(batch);
  // Keep the page's row count in step with the rows that were just appended.
  matrix.n_rows = matrix.n_rows + batch.Size();
}
|
||||
|
||||
// Compress the accumulated SparsePage.
|
||||
void EllpackPageImpl::CompressSparsePage(int device) {
|
||||
monitor_.StartCuda("InitCompressedData");
|
||||
InitCompressedData(device, batch.Size());
|
||||
InitCompressedData(device, matrix.n_rows);
|
||||
monitor_.StopCuda("InitCompressedData");
|
||||
|
||||
monitor_.StartCuda("BinningCompression");
|
||||
DeviceHistogramBuilderState hist_builder_row_state(batch.Size());
|
||||
hist_builder_row_state.BeginBatch(batch);
|
||||
CreateHistIndices(device, batch, hist_builder_row_state.GetRowStateOnDevice());
|
||||
DeviceHistogramBuilderState hist_builder_row_state(matrix.n_rows);
|
||||
hist_builder_row_state.BeginBatch(sparse_page_);
|
||||
CreateHistIndices(device, sparse_page_, hist_builder_row_state.GetRowStateOnDevice());
|
||||
hist_builder_row_state.EndBatch();
|
||||
monitor_.StopCuda("BinningCompression");
|
||||
|
||||
monitor_.StartCuda("CopyDeviceToHost");
|
||||
std::vector<common::CompressedByteT> buffer(gidx_buffer.size());
|
||||
dh::CopyDeviceSpanToVector(&buffer, gidx_buffer);
|
||||
int offset = 0;
|
||||
if (!idx_buffer.empty()) {
|
||||
offset = ::xgboost::common::detail::kPadding;
|
||||
}
|
||||
idx_buffer.reserve(idx_buffer.size() + buffer.size() - offset);
|
||||
idx_buffer.insert(idx_buffer.end(), buffer.begin() + offset, buffer.end());
|
||||
idx_buffer.resize(gidx_buffer.size());
|
||||
dh::CopyDeviceSpanToVector(&idx_buffer, gidx_buffer);
|
||||
ba_.Clear();
|
||||
gidx_buffer = {};
|
||||
monitor_.StopCuda("CopyDeviceToHost");
|
||||
|
||||
n_rows += batch.Size();
|
||||
}
|
||||
|
||||
// Return the memory cost for storing the compressed features.
|
||||
size_t EllpackPageImpl::MemCostBytes() const {
|
||||
return idx_buffer.size() * sizeof(common::CompressedByteT);
|
||||
size_t num_symbols = matrix.info.n_bins + 1;
|
||||
|
||||
// Required buffer size for storing data matrix in ELLPack format.
|
||||
size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
|
||||
matrix.info.row_stride * matrix.n_rows, num_symbols);
|
||||
return compressed_size_bytes;
|
||||
}
|
||||
|
||||
// Copy the compressed features to GPU.
|
||||
|
||||
@@ -78,13 +78,14 @@ struct EllpackInfo {
|
||||
* kernels.*/
|
||||
struct EllpackMatrix {
|
||||
EllpackInfo info;
|
||||
size_t base_rowid{};
|
||||
size_t n_rows{};
|
||||
common::CompressedIterator<uint32_t> gidx_iter;
|
||||
|
||||
XGBOOST_DEVICE size_t BinCount() const { return info.gidx_fvalue_map.size(); }
|
||||
|
||||
// Get a matrix element; uses binary search for lookup. Returns NaN if missing.
|
||||
// Given a row index and a feature index, returns the corresponding cut value
|
||||
__device__ bst_float GetElement(size_t ridx, size_t fidx) const {
|
||||
ridx -= base_rowid;
|
||||
auto row_begin = info.row_stride * ridx;
|
||||
auto row_end = row_begin + info.row_stride;
|
||||
auto gidx = -1;
|
||||
@@ -102,6 +103,11 @@ struct EllpackMatrix {
|
||||
}
|
||||
return info.gidx_fvalue_map[gidx];
|
||||
}
|
||||
|
||||
// Check if the row id is within range of the current batch.
|
||||
__device__ bool IsInRange(size_t row_id) const {
  // The batch covers the half-open row interval [base_rowid, base_rowid + n_rows).
  const size_t batch_end = base_rowid + n_rows;
  return row_id >= base_rowid && row_id < batch_end;
}
|
||||
};
|
||||
|
||||
// Instances of this type are created while creating the histogram bins for the
|
||||
@@ -185,7 +191,6 @@ class EllpackPageImpl {
|
||||
/*! \brief global index of histogram, which is stored in ELLPack format. */
|
||||
common::Span<common::CompressedByteT> gidx_buffer;
|
||||
std::vector<common::CompressedByteT> idx_buffer;
|
||||
size_t n_rows{};
|
||||
|
||||
/*!
|
||||
* \brief Default constructor.
|
||||
@@ -240,7 +245,7 @@ class EllpackPageImpl {
|
||||
|
||||
/*! \brief Set the base row id for this page. */
|
||||
inline void SetBaseRowId(size_t row_id) {
|
||||
base_rowid_ = row_id;
|
||||
matrix.base_rowid = row_id;
|
||||
}
|
||||
|
||||
/*! \brief clear the page. */
|
||||
@@ -263,11 +268,17 @@ class EllpackPageImpl {
|
||||
*/
|
||||
void InitDevice(int device, EllpackInfo info);
|
||||
|
||||
/*! \brief Compress the accumulated SparsePage into ELLPACK format.
|
||||
*
|
||||
* @param device The GPU device to use.
|
||||
*/
|
||||
void CompressSparsePage(int device);
|
||||
|
||||
private:
|
||||
common::Monitor monitor_;
|
||||
dh::BulkAllocator ba_;
|
||||
size_t base_rowid_{};
|
||||
bool device_initialized_{false};
|
||||
SparsePage sparse_page_{};
|
||||
};
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -17,7 +17,8 @@ class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
|
||||
public:
|
||||
bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
|
||||
auto* impl = page->Impl();
|
||||
if (!fi->Read(&impl->n_rows)) return false;
|
||||
impl->Clear();
|
||||
if (!fi->Read(&impl->matrix.n_rows)) return false;
|
||||
return fi->Read(&impl->idx_buffer);
|
||||
}
|
||||
|
||||
@@ -25,13 +26,14 @@ class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
|
||||
dmlc::SeekStream* fi,
|
||||
const std::vector<bst_uint>& sorted_index_set) override {
|
||||
auto* impl = page->Impl();
|
||||
if (!fi->Read(&impl->n_rows)) return false;
|
||||
impl->Clear();
|
||||
if (!fi->Read(&impl->matrix.n_rows)) return false;
|
||||
return fi->Read(&page->Impl()->idx_buffer);
|
||||
}
|
||||
|
||||
void Write(const EllpackPage& page, dmlc::Stream* fo) override {
|
||||
auto* impl = page.Impl();
|
||||
fo->Write(impl->n_rows);
|
||||
fo->Write(impl->matrix.n_rows);
|
||||
auto buffer = impl->idx_buffer;
|
||||
CHECK(!buffer.empty());
|
||||
fo->Write(buffer);
|
||||
|
||||
@@ -40,11 +40,13 @@ class EllpackPageSourceImpl : public DataSource<EllpackPage> {
|
||||
const std::string kPageType_{".ellpack.page"};
|
||||
|
||||
int device_{-1};
|
||||
size_t page_size_{DMatrix::kPageSize};
|
||||
common::Monitor monitor_;
|
||||
dh::BulkAllocator ba_;
|
||||
/*! \brief The EllpackInfo, with the underlying GPU memory shared by all pages. */
|
||||
EllpackInfo ellpack_info_;
|
||||
std::unique_ptr<SparsePageSource<EllpackPage>> source_;
|
||||
std::string cache_info_;
|
||||
};
|
||||
|
||||
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
||||
@@ -72,8 +74,12 @@ const EllpackPage& EllpackPageSource::Value() const {
|
||||
// each CSR page, and write the accumulated ELLPACK pages to disk.
|
||||
EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
|
||||
const std::string& cache_info,
|
||||
const BatchParam& param) noexcept(false) {
|
||||
device_ = param.gpu_id;
|
||||
const BatchParam& param) noexcept(false)
|
||||
: device_(param.gpu_id), cache_info_(cache_info) {
|
||||
|
||||
if (param.gpu_page_size > 0) {
|
||||
page_size_ = param.gpu_page_size;
|
||||
}
|
||||
|
||||
monitor_.Init("ellpack_page_source");
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
@@ -92,10 +98,11 @@ EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
|
||||
WriteEllpackPages(dmat, cache_info);
|
||||
monitor_.StopCuda("WriteEllpackPages");
|
||||
|
||||
source_.reset(new SparsePageSource<EllpackPage>(cache_info, kPageType_));
|
||||
source_.reset(new SparsePageSource<EllpackPage>(cache_info_, kPageType_));
|
||||
}
|
||||
|
||||
void EllpackPageSourceImpl::BeforeFirst() {
|
||||
source_.reset(new SparsePageSource<EllpackPage>(cache_info_, kPageType_));
|
||||
source_->BeforeFirst();
|
||||
}
|
||||
|
||||
@@ -133,20 +140,23 @@ void EllpackPageSourceImpl::WriteEllpackPages(DMatrix* dmat, const std::string&
|
||||
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
|
||||
impl->Push(device_, batch);
|
||||
|
||||
if (impl->MemCostBytes() >= DMatrix::kPageSize) {
|
||||
bytes_write += impl->MemCostBytes();
|
||||
size_t mem_cost_bytes = impl->MemCostBytes();
|
||||
if (mem_cost_bytes >= page_size_) {
|
||||
bytes_write += mem_cost_bytes;
|
||||
impl->CompressSparsePage(device_);
|
||||
writer.PushWrite(std::move(page));
|
||||
writer.Alloc(&page);
|
||||
impl = page->Impl();
|
||||
impl->matrix.info = ellpack_info_;
|
||||
impl->Clear();
|
||||
double tdiff = dmlc::GetTime() - tstart;
|
||||
LOG(INFO) << "Writing to " << cache_info << " in "
|
||||
LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in "
|
||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||
<< (bytes_write >> 20UL) << " written";
|
||||
}
|
||||
}
|
||||
if (impl->Size() != 0) {
|
||||
impl->CompressSparsePage(device_);
|
||||
writer.PushWrite(std::move(page));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,10 +81,7 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& par
|
||||
CHECK_GE(param.gpu_id, 0);
|
||||
CHECK_GE(param.max_bin, 2);
|
||||
// Lazily instantiate
|
||||
if (!ellpack_source_ ||
|
||||
batch_param_.gpu_id != param.gpu_id ||
|
||||
batch_param_.max_bin != param.max_bin ||
|
||||
batch_param_.gpu_batch_nrows != param.gpu_batch_nrows) {
|
||||
if (!ellpack_source_ || batch_param_ != param) {
|
||||
ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param));
|
||||
batch_param_ = param;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user