Cleanup to prepare for using mmap pointer in external memory. (#9317)

- Update the `SparsePageDMatrix` comment.
- Use a pointer in the bitfield. We will replace the `std::vector<bool>` in `ColumnMatrix` with a bitfield.
- Clean up the page source. The timer is removed as it's inaccurate once we swap the mmap pointer into the page.
2023-06-22 06:43:11 +08:00
parent 4066d68261
commit 54da4b3185
18 changed files with 220 additions and 171 deletions
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -590,7 +590,7 @@ class ArrayInterface {
 template <std::int32_t D, typename Fn>
 void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
  // Only used for cuDF at the moment.
-  CHECK_EQ(array.valid.Size(), 0);
+  CHECK_EQ(array.valid.Capacity(), 0);
  auto dispatch = [&](auto t) {
    using T = std::remove_const_t<decltype(t)> const;
    // Set the data size to max as we don't know the original size of a sliced array:
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -416,7 +416,8 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
    p_out->Reshape(array.shape);
    return;
  }
-  CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
+  CHECK_EQ(array.valid.Capacity(), 0)
+      << "Meta info like label or weight can not have missing value.";
  if (array.is_contiguous && array.type == ToDType<T>::kType) {
    // Handle contigious
    p_out->ModifyInplace([&](HostDeviceVector<T>* data, common::Span<size_t, D> shape) {
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -33,7 +33,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
    p_out->Reshape(array.shape);
    return;
  }
-  CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
+  CHECK_EQ(array.valid.Capacity(), 0)
+      << "Meta info like label or weight can not have missing value.";
  auto ptr_device = SetDeviceToPtr(array.data);
  p_out->SetDevice(ptr_device);

--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -5,6 +5,7 @@
 #include <thrust/iterator/transform_output_iterator.h>

 #include "../common/categorical.h"
+#include "../common/cuda_context.cuh"
 #include "../common/hist_util.cuh"
 #include "../common/random.h"
 #include "../common/transform_iterator.h"  // MakeIndexTransformIter
@@ -313,7 +314,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
  auto d_csc_indptr = dh::ToSpan(csc_indptr);

  auto bin_type = page.index.GetBinTypeSize();
-  common::CompressedBufferWriter writer{page.cut.TotalBins() + 1};  // +1 for null value
+  common::CompressedBufferWriter writer{page.cut.TotalBins() +
+                                        static_cast<std::size_t>(1)};  // +1 for null value

  dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
    auto ridx = idx / row_stride;
@@ -357,8 +359,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag

  // copy gidx
  common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
-  dh::device_vector<size_t> row_ptr(page.row_ptr);
+  dh::device_vector<size_t> row_ptr(page.row_ptr.size());
  auto d_row_ptr = dh::ToSpan(row_ptr);
+  dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
+                                cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));

  auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
  auto null = accessor.NullValue();
--- a/src/data/sparse_page_dmatrix.h
+++ b/src/data/sparse_page_dmatrix.h
@@ -7,9 +7,6 @@
 #ifndef XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
 #define XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_

-#include <xgboost/data.h>
-#include <xgboost/logging.h>
-
 #include <algorithm>
 #include <map>
 #include <memory>
@@ -20,35 +17,33 @@
 #include "ellpack_page_source.h"
 #include "gradient_index_page_source.h"
 #include "sparse_page_source.h"
+#include "xgboost/data.h"
+#include "xgboost/logging.h"

-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 /**
 * \brief DMatrix used for external memory.
 *
 * The external memory is created for controlling memory usage by splitting up data into
- * multiple batches.  However that doesn't mean we will actually process exact 1 batch at
- * a time, which would be terribly slow considering that we have to loop through the
- * whole dataset for every tree split.  So we use async pre-fetch and let caller to decide
- * how many batches it wants to process by returning data as shared pointer.  The caller
- * can use async function to process the data or just stage those batches, making the
- * decision is out of the scope for sparse page dmatrix.  These 2 optimizations might
- * defeat the purpose of splitting up dataset since if you load all the batches then the
- * memory usage is even worse than using a single batch.  Essentially we need to control
- * how many batches can be in memory at the same time.
+ * multiple batches.  However that doesn't mean we will actually process exactly 1 batch
+ * at a time, which would be terribly slow considering that we have to loop through the
+ * whole dataset for every tree split.  So we use async to pre-fetch pages and let the
+ * caller decide how many batches it wants to process by returning data as a shared
+ * pointer. The caller can use async function to process the data or just stage those
+ * batches based on its use cases. These two optimizations might defeat the purpose of
+ * splitting up dataset since if you stage all the batches then the memory usage might be
+ * even worse than using a single batch. As a result, we must control how many batches can
+ * be in memory at any given time.
 *
- * Right now the write to the cache is sequential operation and is blocking, reading from
- * cache is async but with a hard coded limit of 4 pages as an heuristic.  So by sparse
- * dmatrix itself there can be only 9 pages in main memory (might be of different types)
- * at the same time: 1 page pending for write, 4 pre-fetched sparse pages, 4 pre-fetched
- * dependent pages.  If the caller stops iteration at the middle and start again, then the
- * number of pages in memory can hit 16 due to pre-fetching, but this should be a bug in
- * caller's code (XGBoost doesn't discard a large portion of data at the end, there's not
- * sampling algo that samples only the first portion of data).
+ * Right now the write to the cache is a sequential operation and is blocking. Reading
+ * from cache, on the other hand, is async but with a hard coded limit of 3 pages as an
+ * heuristic.  So by sparse dmatrix itself there can be only 7 pages in main memory (might
+ * be of different types) at the same time: 1 page pending for write, 3 pre-fetched sparse
+ * pages, 3 pre-fetched dependent pages.
 *
 * Of course if the caller decides to retain some batches to perform parallel processing,
 * then we might load all pages in memory, which is also considered as a bug in caller's
- * code.  So if the algo supports external memory, it must be careful that queue for async
+ * code. So if the algo supports external memory, it must be careful that queue for async
 * call must have an upper limit.
 *
 * Another assumption we make is that the data must be immutable so caller should never
@@ -101,7 +96,7 @@ class SparsePageDMatrix : public DMatrix {
  MetaInfo &Info() override;
  const MetaInfo &Info() const override;
  Context const *Ctx() const override { return &fmat_ctx_; }
-
+  // The only DMatrix implementation that returns false.
  bool SingleColBlock() const override { return false; }
  DMatrix *Slice(common::Span<int32_t const>) override {
    LOG(FATAL) << "Slicing DMatrix is not supported for external memory.";
@@ -153,6 +148,5 @@ inline std::string MakeCache(SparsePageDMatrix *ptr, std::string format, std::st
  }
  return id;
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
 #endif  // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -6,39 +6,43 @@
 #define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_

 #include <algorithm>  // for min
-#include <future>     // async
+#include <future>     // for async
 #include <map>
 #include <memory>
 #include <string>
 #include <thread>
-#include <utility>
+#include <utility>  // for pair, move
 #include <vector>

 #include "../common/common.h"
-#include "../common/io.h"     // for PrivateMmapStream, PadPageForMMAP
+#include "../common/io.h"     // for PrivateMmapConstStream
 #include "../common/timer.h"  // for Monitor, Timer
 #include "adapter.h"
-#include "dmlc/common.h"  // OMPException
-#include "proxy_dmatrix.h"
-#include "sparse_page_writer.h"
+#include "dmlc/common.h"         // for OMPException
+#include "proxy_dmatrix.h"       // for DMatrixProxy
+#include "sparse_page_writer.h"  // for SparsePageFormat
 #include "xgboost/base.h"
 #include "xgboost/data.h"

 namespace xgboost::data {
 inline void TryDeleteCacheFile(const std::string& file) {
  if (std::remove(file.c_str()) != 0) {
+    // Don't throw, this is called in a destructor.
    LOG(WARNING) << "Couldn't remove external memory cache file " << file
                 << "; you may want to remove it manually";
  }
 }

+/**
+ * @brief Information about the cache including path and page offsets.
+ */
 struct Cache {
  // whether the write to the cache is complete
  bool written;
  std::string name;
  std::string format;
  // offset into binary cache file.
-  std::vector<size_t> offset;
+  std::vector<std::uint64_t> offset;

  Cache(bool w, std::string n, std::string fmt)
      : written{w}, name{std::move(n)}, format{std::move(fmt)} {
@@ -50,14 +54,24 @@ struct Cache {
    return name + format;
  }

-  std::string ShardName() {
+  [[nodiscard]] std::string ShardName() const {
    return ShardName(this->name, this->format);
  }
-  void Push(std::size_t n_bytes) {
-    offset.push_back(n_bytes);
+  /**
+   * @brief Record a page with size of n_bytes.
+   */
+  void Push(std::size_t n_bytes) { offset.push_back(n_bytes); }
+  /**
+   * @brief Returns the view start and length for the i^th page.
+   */
+  [[nodiscard]] auto View(std::size_t i) const {
+    std::uint64_t off = offset.at(i);
+    std::uint64_t len = offset.at(i + 1) - offset[i];
+    return std::pair{off, len};
  }
-
-  // The write is completed.
+  /**
+   * @brief Call this once the write for the cache is complete.
+   */
  void Commit() {
    if (!written) {
      std::partial_sum(offset.begin(), offset.end(), offset.begin());
@@ -66,7 +80,7 @@ struct Cache {
  }
 };

-// Prevents multi-threaded call.
+// Prevents multi-threaded call to `GetBatches`.
 class TryLockGuard {
  std::mutex& lock_;

@@ -79,22 +93,25 @@ class TryLockGuard {
  }
 };

+/**
+ * @brief Base class for all page sources. Handles fetching, writing, and iteration.
+ */
 template <typename S>
 class SparsePageSourceImpl : public BatchIteratorImpl<S> {
 protected:
  // Prevents calling this iterator from multiple places(or threads).
  std::mutex single_threaded_;
-
+  // The current page.
  std::shared_ptr<S> page_;

  bool at_end_ {false};
  float missing_;
-  int nthreads_;
+  std::int32_t nthreads_;
  bst_feature_t n_features_;
-
-  uint32_t count_{0};
-
-  uint32_t n_batches_ {0};
+  // Index to the current page.
+  std::uint32_t count_{0};
+  // Total number of batches.
+  std::uint32_t n_batches_{0};

  std::shared_ptr<Cache> cache_info_;

@@ -102,6 +119,9 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
  // A ring storing futures to data.  Since the DMatrix iterator is forward only, so we
  // can pre-fetch data in a ring.
  std::unique_ptr<Ring> ring_{new Ring};
+  // Catch exceptions in pre-fetch threads to prevent segfault. This doesn't always work,
+  // as an OOM error can be delayed due to lazy commit. On the bright side, if mmap is used
+  // then OOM errors should be rare.
  dmlc::OMPException exec_;
  common::Monitor monitor_;

@@ -123,7 +143,6 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {

    exec_.Rethrow();

-    monitor_.Start("launch");
    for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
      fetch_it %= n_batches_;  // ring
      if (ring_->at(fetch_it).valid()) {
@@ -134,33 +153,25 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
      ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() {
        auto page = std::make_shared<S>();
        this->exec_.Run([&] {
-          common::Timer timer;
-          timer.Start();
          std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
-          auto n = self->cache_info_->ShardName();
-
-          std::uint64_t offset = self->cache_info_->offset.at(fetch_it);
-          std::uint64_t length = self->cache_info_->offset.at(fetch_it + 1) - offset;
-
-          auto fi = std::make_unique<common::PrivateMmapConstStream>(n, offset, length);
+          auto name = self->cache_info_->ShardName();
+          auto [offset, length] = self->cache_info_->View(fetch_it);
+          auto fi = std::make_unique<common::PrivateMmapConstStream>(name, offset, length);
          CHECK(fmt->Read(page.get(), fi.get()));
-          timer.Stop();
-
-          LOG(INFO) << "Read a page `" << typeid(S).name() << "` in " << timer.ElapsedSeconds()
-                    << " seconds.";
        });
        return page;
      });
    }
-    monitor_.Stop("launch");

    CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }),
             n_prefetch_batches)
        << "Sparse DMatrix assumes forward iteration.";
+
    monitor_.Start("Wait");
    page_ = (*ring_)[count_].get();
-    monitor_.Stop("Wait");
    CHECK(!(*ring_)[count_].valid());
+    monitor_.Stop("Wait");
+
    exec_.Rethrow();

    return true;
@@ -183,6 +194,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
    auto bytes = fmt->Write(*page_, fo.get());

    timer.Stop();
+    // Not entirely accurate, the kernel doesn't have to flush the data.
    LOG(INFO) << static_cast<double>(bytes) / 1024.0 / 1024.0 << " MB written in "
              << timer.ElapsedSeconds() << " seconds.";
    cache_info_->Push(bytes);
@@ -204,6 +216,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
  SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete;

  ~SparsePageSourceImpl() override {
+    // Don't orphan the threads.
    for (auto& fu : *ring_) {
      if (fu.valid()) {
        fu.get();
@@ -211,18 +224,18 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
    }
  }

-  uint32_t Iter() const { return count_; }
+  [[nodiscard]] uint32_t Iter() const { return count_; }

  const S &operator*() const override {
    CHECK(page_);
    return *page_;
  }

-  std::shared_ptr<S const> Page() const override {
+  [[nodiscard]] std::shared_ptr<S const> Page() const override {
    return page_;
  }

-  bool AtEnd() const override {
+  [[nodiscard]] bool AtEnd() const override {
    return at_end_;
  }

@@ -230,20 +243,23 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
    TryLockGuard guard{single_threaded_};
    at_end_ = false;
    count_ = 0;
+    // Pre-fetch for the next round of iterations.
    this->Fetch();
  }
 };

 #if defined(XGBOOST_USE_CUDA)
+// Push data from CUDA.
 void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page);
 #else
 inline void DevicePush(DMatrixProxy*, float, SparsePage*) { common::AssertGPUSupport(); }
 #endif

 class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
+  // This is the source from the user.
  DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext> iter_;
  DMatrixProxy* proxy_;
-  size_t base_row_id_ {0};
+  std::size_t base_row_id_{0};

  void Fetch() final {
    page_ = std::make_shared<SparsePage>();