Cleanup to prepare for using mmap pointer in external memory. (#9317)
- Update SparseDMatrix comment. - Use a pointer in the bitfield. We will replace the `std::vector<bool>` in `ColumnMatrix` with bitfield. - Clean up the page source. The timer is removed as it's inaccurate once we swap the mmap pointer into the page.
This commit is contained in:
@@ -590,7 +590,7 @@ class ArrayInterface {
|
||||
template <std::int32_t D, typename Fn>
|
||||
void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
|
||||
// Only used for cuDF at the moment.
|
||||
CHECK_EQ(array.valid.Size(), 0);
|
||||
CHECK_EQ(array.valid.Capacity(), 0);
|
||||
auto dispatch = [&](auto t) {
|
||||
using T = std::remove_const_t<decltype(t)> const;
|
||||
// Set the data size to max as we don't know the original size of a sliced array:
|
||||
|
||||
@@ -416,7 +416,8 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
|
||||
p_out->Reshape(array.shape);
|
||||
return;
|
||||
}
|
||||
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
|
||||
CHECK_EQ(array.valid.Capacity(), 0)
|
||||
<< "Meta info like label or weight can not have missing value.";
|
||||
if (array.is_contiguous && array.type == ToDType<T>::kType) {
|
||||
// Handle contigious
|
||||
p_out->ModifyInplace([&](HostDeviceVector<T>* data, common::Span<size_t, D> shape) {
|
||||
|
||||
@@ -33,7 +33,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
|
||||
p_out->Reshape(array.shape);
|
||||
return;
|
||||
}
|
||||
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
|
||||
CHECK_EQ(array.valid.Capacity(), 0)
|
||||
<< "Meta info like label or weight can not have missing value.";
|
||||
auto ptr_device = SetDeviceToPtr(array.data);
|
||||
p_out->SetDevice(ptr_device);
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include <thrust/iterator/transform_output_iterator.h>
|
||||
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/cuda_context.cuh"
|
||||
#include "../common/hist_util.cuh"
|
||||
#include "../common/random.h"
|
||||
#include "../common/transform_iterator.h" // MakeIndexTransformIter
|
||||
@@ -313,7 +314,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
|
||||
auto d_csc_indptr = dh::ToSpan(csc_indptr);
|
||||
|
||||
auto bin_type = page.index.GetBinTypeSize();
|
||||
common::CompressedBufferWriter writer{page.cut.TotalBins() + 1}; // +1 for null value
|
||||
common::CompressedBufferWriter writer{page.cut.TotalBins() +
|
||||
static_cast<std::size_t>(1)}; // +1 for null value
|
||||
|
||||
dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
|
||||
auto ridx = idx / row_stride;
|
||||
@@ -357,8 +359,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
|
||||
|
||||
// copy gidx
|
||||
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
|
||||
dh::device_vector<size_t> row_ptr(page.row_ptr);
|
||||
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
|
||||
auto d_row_ptr = dh::ToSpan(row_ptr);
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
|
||||
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
|
||||
|
||||
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
|
||||
auto null = accessor.NullValue();
|
||||
|
||||
@@ -7,9 +7,6 @@
|
||||
#ifndef XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
|
||||
#define XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
|
||||
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/logging.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
@@ -20,35 +17,33 @@
|
||||
#include "ellpack_page_source.h"
|
||||
#include "gradient_index_page_source.h"
|
||||
#include "sparse_page_source.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/logging.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
namespace xgboost::data {
|
||||
/**
|
||||
* \brief DMatrix used for external memory.
|
||||
*
|
||||
* The external memory is created for controlling memory usage by splitting up data into
|
||||
* multiple batches. However that doesn't mean we will actually process exact 1 batch at
|
||||
* a time, which would be terribly slow considering that we have to loop through the
|
||||
* whole dataset for every tree split. So we use async pre-fetch and let caller to decide
|
||||
* how many batches it wants to process by returning data as shared pointer. The caller
|
||||
* can use async function to process the data or just stage those batches, making the
|
||||
* decision is out of the scope for sparse page dmatrix. These 2 optimizations might
|
||||
* defeat the purpose of splitting up dataset since if you load all the batches then the
|
||||
* memory usage is even worse than using a single batch. Essentially we need to control
|
||||
* how many batches can be in memory at the same time.
|
||||
* multiple batches. However that doesn't mean we will actually process exactly 1 batch
|
||||
* at a time, which would be terribly slow considering that we have to loop through the
|
||||
* whole dataset for every tree split. So we use async to pre-fetch pages and let the
|
||||
* caller to decide how many batches it wants to process by returning data as a shared
|
||||
* pointer. The caller can use async function to process the data or just stage those
|
||||
* batches based on its use cases. These two optimizations might defeat the purpose of
|
||||
* splitting up dataset since if you stage all the batches then the memory usage might be
|
||||
* even worse than using a single batch. As a result, we must control how many batches can
|
||||
* be in memory at any given time.
|
||||
*
|
||||
* Right now the write to the cache is sequential operation and is blocking, reading from
|
||||
* cache is async but with a hard coded limit of 4 pages as an heuristic. So by sparse
|
||||
* dmatrix itself there can be only 9 pages in main memory (might be of different types)
|
||||
* at the same time: 1 page pending for write, 4 pre-fetched sparse pages, 4 pre-fetched
|
||||
* dependent pages. If the caller stops iteration at the middle and start again, then the
|
||||
* number of pages in memory can hit 16 due to pre-fetching, but this should be a bug in
|
||||
* caller's code (XGBoost doesn't discard a large portion of data at the end, there's not
|
||||
* sampling algo that samples only the first portion of data).
|
||||
* Right now the write to the cache is a sequential operation and is blocking. Reading
|
||||
* from cache on ther other hand, is async but with a hard coded limit of 3 pages as an
|
||||
* heuristic. So by sparse dmatrix itself there can be only 7 pages in main memory (might
|
||||
* be of different types) at the same time: 1 page pending for write, 3 pre-fetched sparse
|
||||
* pages, 3 pre-fetched dependent pages.
|
||||
*
|
||||
* Of course if the caller decides to retain some batches to perform parallel processing,
|
||||
* then we might load all pages in memory, which is also considered as a bug in caller's
|
||||
* code. So if the algo supports external memory, it must be careful that queue for async
|
||||
* code. So if the algo supports external memory, it must be careful that queue for async
|
||||
* call must have an upper limit.
|
||||
*
|
||||
* Another assumption we make is that the data must be immutable so caller should never
|
||||
@@ -101,7 +96,7 @@ class SparsePageDMatrix : public DMatrix {
|
||||
MetaInfo &Info() override;
|
||||
const MetaInfo &Info() const override;
|
||||
Context const *Ctx() const override { return &fmat_ctx_; }
|
||||
|
||||
// The only DMatrix implementation that returns false.
|
||||
bool SingleColBlock() const override { return false; }
|
||||
DMatrix *Slice(common::Span<int32_t const>) override {
|
||||
LOG(FATAL) << "Slicing DMatrix is not supported for external memory.";
|
||||
@@ -153,6 +148,5 @@ inline std::string MakeCache(SparsePageDMatrix *ptr, std::string format, std::st
|
||||
}
|
||||
return id;
|
||||
}
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::data
|
||||
#endif // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
|
||||
|
||||
@@ -6,39 +6,43 @@
|
||||
#define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
|
||||
|
||||
#include <algorithm> // for min
|
||||
#include <future> // async
|
||||
#include <future> // for async
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
#include <utility> // for pair, move
|
||||
#include <vector>
|
||||
|
||||
#include "../common/common.h"
|
||||
#include "../common/io.h" // for PrivateMmapStream, PadPageForMMAP
|
||||
#include "../common/io.h" // for PrivateMmapConstStream
|
||||
#include "../common/timer.h" // for Monitor, Timer
|
||||
#include "adapter.h"
|
||||
#include "dmlc/common.h" // OMPException
|
||||
#include "proxy_dmatrix.h"
|
||||
#include "sparse_page_writer.h"
|
||||
#include "dmlc/common.h" // for OMPException
|
||||
#include "proxy_dmatrix.h" // for DMatrixProxy
|
||||
#include "sparse_page_writer.h" // for SparsePageFormat
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
inline void TryDeleteCacheFile(const std::string& file) {
|
||||
if (std::remove(file.c_str()) != 0) {
|
||||
// Don't throw, this is called in a destructor.
|
||||
LOG(WARNING) << "Couldn't remove external memory cache file " << file
|
||||
<< "; you may want to remove it manually";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Information about the cache including path and page offsets.
|
||||
*/
|
||||
struct Cache {
|
||||
// whether the write to the cache is complete
|
||||
bool written;
|
||||
std::string name;
|
||||
std::string format;
|
||||
// offset into binary cache file.
|
||||
std::vector<size_t> offset;
|
||||
std::vector<std::uint64_t> offset;
|
||||
|
||||
Cache(bool w, std::string n, std::string fmt)
|
||||
: written{w}, name{std::move(n)}, format{std::move(fmt)} {
|
||||
@@ -50,14 +54,24 @@ struct Cache {
|
||||
return name + format;
|
||||
}
|
||||
|
||||
std::string ShardName() {
|
||||
[[nodiscard]] std::string ShardName() const {
|
||||
return ShardName(this->name, this->format);
|
||||
}
|
||||
void Push(std::size_t n_bytes) {
|
||||
offset.push_back(n_bytes);
|
||||
/**
|
||||
* @brief Record a page with size of n_bytes.
|
||||
*/
|
||||
void Push(std::size_t n_bytes) { offset.push_back(n_bytes); }
|
||||
/**
|
||||
* @brief Returns the view start and length for the i^th page.
|
||||
*/
|
||||
[[nodiscard]] auto View(std::size_t i) const {
|
||||
std::uint64_t off = offset.at(i);
|
||||
std::uint64_t len = offset.at(i + 1) - offset[i];
|
||||
return std::pair{off, len};
|
||||
}
|
||||
|
||||
// The write is completed.
|
||||
/**
|
||||
* @brief Call this once the write for the cache is complete.
|
||||
*/
|
||||
void Commit() {
|
||||
if (!written) {
|
||||
std::partial_sum(offset.begin(), offset.end(), offset.begin());
|
||||
@@ -66,7 +80,7 @@ struct Cache {
|
||||
}
|
||||
};
|
||||
|
||||
// Prevents multi-threaded call.
|
||||
// Prevents multi-threaded call to `GetBatches`.
|
||||
class TryLockGuard {
|
||||
std::mutex& lock_;
|
||||
|
||||
@@ -79,22 +93,25 @@ class TryLockGuard {
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Base class for all page sources. Handles fetching, writing, and iteration.
|
||||
*/
|
||||
template <typename S>
|
||||
class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
protected:
|
||||
// Prevents calling this iterator from multiple places(or threads).
|
||||
std::mutex single_threaded_;
|
||||
|
||||
// The current page.
|
||||
std::shared_ptr<S> page_;
|
||||
|
||||
bool at_end_ {false};
|
||||
float missing_;
|
||||
int nthreads_;
|
||||
std::int32_t nthreads_;
|
||||
bst_feature_t n_features_;
|
||||
|
||||
uint32_t count_{0};
|
||||
|
||||
uint32_t n_batches_ {0};
|
||||
// Index to the current page.
|
||||
std::uint32_t count_{0};
|
||||
// Total number of batches.
|
||||
std::uint32_t n_batches_{0};
|
||||
|
||||
std::shared_ptr<Cache> cache_info_;
|
||||
|
||||
@@ -102,6 +119,9 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
// A ring storing futures to data. Since the DMatrix iterator is forward only, so we
|
||||
// can pre-fetch data in a ring.
|
||||
std::unique_ptr<Ring> ring_{new Ring};
|
||||
// Catching exception in pre-fetch threads to prevent segfault. Not always work though,
|
||||
// OOM error can be delayed due to lazy commit. On the bright side, if mmap is used then
|
||||
// OOM error should be rare.
|
||||
dmlc::OMPException exec_;
|
||||
common::Monitor monitor_;
|
||||
|
||||
@@ -123,7 +143,6 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
|
||||
exec_.Rethrow();
|
||||
|
||||
monitor_.Start("launch");
|
||||
for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
|
||||
fetch_it %= n_batches_; // ring
|
||||
if (ring_->at(fetch_it).valid()) {
|
||||
@@ -134,33 +153,25 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() {
|
||||
auto page = std::make_shared<S>();
|
||||
this->exec_.Run([&] {
|
||||
common::Timer timer;
|
||||
timer.Start();
|
||||
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
|
||||
auto n = self->cache_info_->ShardName();
|
||||
|
||||
std::uint64_t offset = self->cache_info_->offset.at(fetch_it);
|
||||
std::uint64_t length = self->cache_info_->offset.at(fetch_it + 1) - offset;
|
||||
|
||||
auto fi = std::make_unique<common::PrivateMmapConstStream>(n, offset, length);
|
||||
auto name = self->cache_info_->ShardName();
|
||||
auto [offset, length] = self->cache_info_->View(fetch_it);
|
||||
auto fi = std::make_unique<common::PrivateMmapConstStream>(name, offset, length);
|
||||
CHECK(fmt->Read(page.get(), fi.get()));
|
||||
timer.Stop();
|
||||
|
||||
LOG(INFO) << "Read a page `" << typeid(S).name() << "` in " << timer.ElapsedSeconds()
|
||||
<< " seconds.";
|
||||
});
|
||||
return page;
|
||||
});
|
||||
}
|
||||
monitor_.Stop("launch");
|
||||
|
||||
CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }),
|
||||
n_prefetch_batches)
|
||||
<< "Sparse DMatrix assumes forward iteration.";
|
||||
|
||||
monitor_.Start("Wait");
|
||||
page_ = (*ring_)[count_].get();
|
||||
monitor_.Stop("Wait");
|
||||
CHECK(!(*ring_)[count_].valid());
|
||||
monitor_.Stop("Wait");
|
||||
|
||||
exec_.Rethrow();
|
||||
|
||||
return true;
|
||||
@@ -183,6 +194,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
auto bytes = fmt->Write(*page_, fo.get());
|
||||
|
||||
timer.Stop();
|
||||
// Not entirely accurate, the kernels doesn't have to flush the data.
|
||||
LOG(INFO) << static_cast<double>(bytes) / 1024.0 / 1024.0 << " MB written in "
|
||||
<< timer.ElapsedSeconds() << " seconds.";
|
||||
cache_info_->Push(bytes);
|
||||
@@ -204,6 +216,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete;
|
||||
|
||||
~SparsePageSourceImpl() override {
|
||||
// Don't orphan the threads.
|
||||
for (auto& fu : *ring_) {
|
||||
if (fu.valid()) {
|
||||
fu.get();
|
||||
@@ -211,18 +224,18 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t Iter() const { return count_; }
|
||||
[[nodiscard]] uint32_t Iter() const { return count_; }
|
||||
|
||||
const S &operator*() const override {
|
||||
CHECK(page_);
|
||||
return *page_;
|
||||
}
|
||||
|
||||
std::shared_ptr<S const> Page() const override {
|
||||
[[nodiscard]] std::shared_ptr<S const> Page() const override {
|
||||
return page_;
|
||||
}
|
||||
|
||||
bool AtEnd() const override {
|
||||
[[nodiscard]] bool AtEnd() const override {
|
||||
return at_end_;
|
||||
}
|
||||
|
||||
@@ -230,20 +243,23 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
TryLockGuard guard{single_threaded_};
|
||||
at_end_ = false;
|
||||
count_ = 0;
|
||||
// Pre-fetch for the next round of iterations.
|
||||
this->Fetch();
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
// Push data from CUDA.
|
||||
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page);
|
||||
#else
|
||||
inline void DevicePush(DMatrixProxy*, float, SparsePage*) { common::AssertGPUSupport(); }
|
||||
#endif
|
||||
|
||||
class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
|
||||
// This is the source from the user.
|
||||
DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext> iter_;
|
||||
DMatrixProxy* proxy_;
|
||||
size_t base_row_id_ {0};
|
||||
std::size_t base_row_id_{0};
|
||||
|
||||
void Fetch() final {
|
||||
page_ = std::make_shared<SparsePage>();
|
||||
|
||||
Reference in New Issue
Block a user