Use mmap for external memory. (#9282)

- Have basic infrastructure for mmap. - Release file write handle.
2023-06-19 18:52:55 +08:00
parent d8beb517ed
commit ee6809e642
16 changed files with 599 additions and 275 deletions
--- a/src/tree/gpu_hist/gradient_based_sampler.cu
+++ b/src/tree/gpu_hist/gradient_based_sampler.cu
@@ -146,27 +146,30 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
  CombineGradientPair combine_;
 };

-NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
+NoSampling::NoSampling(BatchParam batch_param) : batch_param_(std::move(batch_param)) {}

-GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
+GradientBasedSample NoSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
                                       DMatrix* dmat) {
-  return {dmat->Info().num_row_, page_, gpair};
+  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
+  return {dmat->Info().num_row_, page, gpair};
 }

-ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
-                                                   size_t n_rows, BatchParam batch_param)
-    : batch_param_{std::move(batch_param)},
-      page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
-                                n_rows)) {}
+ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
+    : batch_param_{std::move(batch_param)} {}

 GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
                                                     common::Span<GradientPair> gpair,
                                                     DMatrix* dmat) {
  if (!page_concatenated_) {
    // Concatenate all the external memory ELLPACK pages into a single in-memory page.
+    page_.reset(nullptr);
    size_t offset = 0;
    for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
      auto page = batch.Impl();
+      if (!page_) {
+        page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
+                                                  page->row_stride, dmat->Info().num_row_);
+      }
      size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
      offset += num_elements;
    }
@@ -175,8 +178,8 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
  return {dmat->Info().num_row_, page_.get(), gpair};
 }

-UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
-    : page_(page), subsample_(subsample) {}
+UniformSampling::UniformSampling(BatchParam batch_param, float subsample)
+    : batch_param_{std::move(batch_param)}, subsample_(subsample) {}

 GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
                                            DMatrix* dmat) {
@@ -185,7 +188,8 @@ GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<Gra
  thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
                     thrust::counting_iterator<std::size_t>(0),
                     BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair());
-  return {dmat->Info().num_row_, page_, gpair};
+  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
+  return {dmat->Info().num_row_, page, gpair};
 }

 ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
@@ -236,12 +240,10 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
  return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }

-GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
-                                             size_t n_rows,
-                                             const BatchParam&,
+GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batch_param,
                                             float subsample)
-    : page_(page),
-      subsample_(subsample),
+    : subsample_(subsample),
+      batch_param_{std::move(batch_param)},
      threshold_(n_rows + 1, 0.0f),
      grad_sum_(n_rows, 0.0f) {}

@@ -252,18 +254,19 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
  size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
      gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);

+  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
+
  // Perform Poisson sampling in place.
  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
                    thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
                    PoissonSampling(dh::ToSpan(threshold_), threshold_index,
                                    RandomWeight(common::GlobalRandom()())));
-  return {n_rows, page_, gpair};
+  return {n_rows, page, gpair};
 }

-ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
-    size_t n_rows,
-    BatchParam batch_param,
-    float subsample)
+ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows,
+                                                                         BatchParam batch_param,
+                                                                         float subsample)
    : batch_param_(std::move(batch_param)),
      subsample_(subsample),
      threshold_(n_rows + 1, 0.0f),
@@ -273,16 +276,15 @@ ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
 GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* ctx,
                                                                common::Span<GradientPair> gpair,
                                                                DMatrix* dmat) {
-  size_t n_rows = dmat->Info().num_row_;
+  auto cuctx = ctx->CUDACtx();
+  bst_row_t n_rows = dmat->Info().num_row_;
  size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
      gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);

  // Perform Poisson sampling in place.
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
-                    thrust::counting_iterator<size_t>(0),
-                    dh::tbegin(gpair),
-                    PoissonSampling(dh::ToSpan(threshold_),
-                                    threshold_index,
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
+                    thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
+                    PoissonSampling(dh::ToSpan(threshold_), threshold_index,
                                    RandomWeight(common::GlobalRandom()())));

  // Count the sampled rows.
@@ -290,16 +292,15 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c

  // Compact gradient pairs.
  gpair_.resize(sample_rows);
-  thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
+  thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());

  // Index the sample rows.
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
-  thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
-    sample_row_index_.begin());
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
-                    sample_row_index_.begin(),
-                    sample_row_index_.begin(),
-                    ClearEmptyRows());
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
+                    IsNonZero());
+  thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
+                         sample_row_index_.begin());
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
+                    sample_row_index_.begin(), ClearEmptyRows());

  auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
  auto first_page = (*batch_iterator.begin()).Impl();
@@ -317,13 +318,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
  return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }

-GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
-                                           size_t n_rows, const BatchParam& batch_param,
-                                           float subsample, int sampling_method) {
+GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows,
+                                           const BatchParam& batch_param, float subsample,
+                                           int sampling_method, bool is_external_memory) {
+  // The ctx is kept here for future development of stream-based operations.
  monitor_.Init("gradient_based_sampler");

  bool is_sampling = subsample < 1.0;
-  bool is_external_memory = page->n_rows != n_rows;

  if (is_sampling) {
    switch (sampling_method) {
@@ -331,24 +332,24 @@ GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl c
        if (is_external_memory) {
          strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
        } else {
-          strategy_.reset(new UniformSampling(page, subsample));
+          strategy_.reset(new UniformSampling(batch_param, subsample));
        }
        break;
      case TrainParam::kGradientBased:
        if (is_external_memory) {
-          strategy_.reset(
-              new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
+          strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
        } else {
-          strategy_.reset(new GradientBasedSampling(page, n_rows, batch_param, subsample));
+          strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
        }
        break;
-      default:LOG(FATAL) << "unknown sampling method";
+      default:
+        LOG(FATAL) << "unknown sampling method";
    }
  } else {
    if (is_external_memory) {
-      strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
+      strategy_.reset(new ExternalMemoryNoSampling(batch_param));
    } else {
-      strategy_.reset(new NoSampling(page));
+      strategy_.reset(new NoSampling(batch_param));
    }
  }
 }
@@ -362,11 +363,11 @@ GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
  return sample;
 }

-size_t GradientBasedSampler::CalculateThresholdIndex(
-    common::Span<GradientPair> gpair, common::Span<float> threshold,
-    common::Span<float> grad_sum, size_t sample_rows) {
-  thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold),
-               std::numeric_limits<float>::max());
+size_t GradientBasedSampler::CalculateThresholdIndex(common::Span<GradientPair> gpair,
+                                                     common::Span<float> threshold,
+                                                     common::Span<float> grad_sum,
+                                                     size_t sample_rows) {
+  thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold), std::numeric_limits<float>::max());
  thrust::transform(dh::tbegin(gpair), dh::tend(gpair), dh::tbegin(threshold),
                    CombineGradientPair());
  thrust::sort(dh::tbegin(threshold), dh::tend(threshold) - 1);
@@ -379,6 +380,5 @@ size_t GradientBasedSampler::CalculateThresholdIndex(
      thrust::min_element(dh::tbegin(grad_sum), dh::tend(grad_sum));
  return thrust::distance(dh::tbegin(grad_sum), min) + 1;
 }
-
 };  // namespace tree
 };  // namespace xgboost
--- a/src/tree/gpu_hist/gradient_based_sampler.cuh
+++ b/src/tree/gpu_hist/gradient_based_sampler.cuh
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 by XGBoost Contributors
+/**
+ * Copyright 2019-2023, XGBoost Contributors
 */
 #pragma once
 #include <xgboost/base.h>
@@ -32,37 +32,36 @@ class SamplingStrategy {
 /*! \brief No sampling in in-memory mode. */
 class NoSampling : public SamplingStrategy {
 public:
-  explicit NoSampling(EllpackPageImpl const* page);
-  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
-                             DMatrix* dmat) override;
-
- private:
-  EllpackPageImpl const* page_;
-};
-
-/*! \brief No sampling in external memory mode. */
-class ExternalMemoryNoSampling : public SamplingStrategy {
- public:
-  ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
-                           BatchParam batch_param);
+  explicit NoSampling(BatchParam batch_param);
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                             DMatrix* dmat) override;

 private:
  BatchParam batch_param_;
-  std::unique_ptr<EllpackPageImpl> page_;
+};
+
+/*! \brief No sampling in external memory mode. */
+class ExternalMemoryNoSampling : public SamplingStrategy {
+ public:
+  explicit ExternalMemoryNoSampling(BatchParam batch_param);
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
+
+ private:
+  BatchParam batch_param_;
+  std::unique_ptr<EllpackPageImpl> page_{nullptr};
  bool page_concatenated_{false};
 };

 /*! \brief Uniform sampling in in-memory mode. */
 class UniformSampling : public SamplingStrategy {
 public:
-  UniformSampling(EllpackPageImpl const* page, float subsample);
+  UniformSampling(BatchParam batch_param, float subsample);
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                             DMatrix* dmat) override;

 private:
-  EllpackPageImpl const* page_;
+  BatchParam batch_param_;
  float subsample_;
 };

@@ -84,13 +83,12 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in in-memory mode.. */
 class GradientBasedSampling : public SamplingStrategy {
 public:
-  GradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param,
-                        float subsample);
+  GradientBasedSampling(std::size_t n_rows, BatchParam batch_param, float subsample);
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                             DMatrix* dmat) override;

 private:
-  EllpackPageImpl const* page_;
+  BatchParam batch_param_;
  float subsample_;
  dh::caching_device_vector<float> threshold_;
  dh::caching_device_vector<float> grad_sum_;
@@ -106,11 +104,11 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
 private:
  BatchParam batch_param_;
  float subsample_;
-  dh::caching_device_vector<float> threshold_;
-  dh::caching_device_vector<float> grad_sum_;
+  dh::device_vector<float> threshold_;
+  dh::device_vector<float> grad_sum_;
  std::unique_ptr<EllpackPageImpl> page_;
  dh::device_vector<GradientPair> gpair_;
-  dh::caching_device_vector<size_t> sample_row_index_;
+  dh::device_vector<size_t> sample_row_index_;
 };

 /*! \brief Draw a sample of rows from a DMatrix.
@@ -124,8 +122,8 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
 */
 class GradientBasedSampler {
 public:
-  GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
-                       const BatchParam& batch_param, float subsample, int sampling_method);
+  GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param,
+                       float subsample, int sampling_method, bool is_external_memory);

  /*! \brief Sample from a DMatrix based on the given gradient pairs. */
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);