Gradient based sampling for GPU Hist (#5093)

* Implement gradient-based sampling for the GPU Hist tree method.
* Add samplers and handle compacted pages in GPU Hist.
2020-02-03 18:31:27 -08:00
parent c74216f22c
commit e4b74c4d22
18 changed files with 1187 additions and 175 deletions
--- a/src/tree/gpu_hist/gradient_based_sampler.cu
+++ b/src/tree/gpu_hist/gradient_based_sampler.cu
@@ -0,0 +1,380 @@
+/*!
+ * Copyright 2019 by XGBoost Contributors
+ */
+#include <thrust/functional.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include <xgboost/host_device_vector.h>
+#include <xgboost/logging.h>
+
+#include <algorithm>
+#include <limits>
+
+#include "../../common/compressed_iterator.h"
+#include "../../common/random.h"
+#include "gradient_based_sampler.cuh"
+
+namespace xgboost {
+namespace tree {
+
+/*! \brief A functor that maps an index to a pseudo-random float weight.
+ *
+ * Every call builds a fresh engine from the stored seed and skips `i` draws before sampling, so
+ * the weight for a given (seed, index) pair is the same no matter which thread evaluates it.
+ */
+class RandomWeight : public thrust::unary_function<size_t, float> {
+ public:
+  explicit RandomWeight(size_t seed) : seed_(seed) {}
+
+  XGBOOST_DEVICE float operator()(size_t i) const {
+    thrust::default_random_engine engine(seed_);
+    // Jump to the i-th position of the sequence generated from this seed.
+    engine.discard(i);
+    // Default-constructed distribution, i.e. Thrust's default range.
+    thrust::uniform_real_distribution<float> uniform;
+    return uniform(engine);
+  }
+
+ private:
+  /*! \brief Engine seed; the size_t constructor argument is narrowed to 32 bits. */
+  uint32_t seed_;
+};
+
+/*! \brief A functor that decides, per index, whether a gradient pair is discarded.
+ *
+ * Returns true (discard) when the random weight drawn for index `i` exceeds `p`; an element is
+ * therefore kept when its weight is at most `p`.
+ */
+class BernoulliTrial : public thrust::unary_function<size_t, bool> {
+ public:
+  BernoulliTrial(size_t seed, float p) : rnd_(seed), p_(p) {}
+
+  XGBOOST_DEVICE bool operator()(size_t i) const {
+    const float weight = rnd_(i);
+    return weight > p_;
+  }
+
+ private:
+  /*! \brief Source of the per-index random weight. */
+  RandomWeight rnd_;
+  /*! \brief Weights above this value lead to a discard. */
+  float p_;
+};
+
+/*! \brief A predicate that is true unless both the gradient and the hessian equal zero. */
+struct IsNonZero : public thrust::unary_function<GradientPair, bool> {
+  XGBOOST_DEVICE bool operator()(const GradientPair& gpair) const {
+    const bool is_zero = gpair.GetGrad() == 0 && gpair.GetHess() == 0;
+    return !is_zero;
+  }
+};
+
+/*! \brief A functor that invalidates the row index of a row with an empty gradient pair.
+ *
+ * Rows whose gradient and hessian are both zero get the maximum size_t value as their index;
+ * every other row keeps the index passed in.
+ */
+struct ClearEmptyRows : public thrust::binary_function<GradientPair, size_t, size_t> {
+  XGBOOST_DEVICE size_t operator()(const GradientPair& gpair, size_t row_index) const {
+    const bool is_empty = gpair.GetGrad() == 0 && gpair.GetHess() == 0;
+    return is_empty ? std::numeric_limits<std::size_t>::max() : row_index;
+  }
+};
+
+/*! \brief A functor that reduces a gradient pair to one non-negative float,
+ * sqrt(grad^2 + lambda * hess^2).
+ *
+ * The approach here is based on Minimal Variance Sampling (MVS), with lambda set to 0.1.
+ *
+ * \see Ibragimov, B., & Gusev, G. (2019). Minimal Variance Sampling in Stochastic Gradient
+ * Boosting. In Advances in Neural Information Processing Systems (pp. 15061-15071).
+ */
+class CombineGradientPair : public thrust::unary_function<GradientPair, float> {
+ public:
+  XGBOOST_DEVICE float operator()(const GradientPair& gpair) const {
+    const float grad_squared = powf(gpair.GetGrad(), 2);
+    const float hess_squared = powf(gpair.GetHess(), 2);
+    return sqrtf(grad_squared + kLambda * hess_squared);
+  }
+
+ private:
+  /*! \brief Weight of the squared hessian relative to the squared gradient. */
+  static constexpr float kLambda = 0.1f;
+};
+
+/*! \brief A functor that tests whether a row of the sorted combined gradients yields a valid
+ * sampling threshold for the desired number of sample rows.
+ *
+ * For the row at `row_index` the candidate threshold is
+ *   u = gradient_sum / (sample_rows - n_rows + row_index + 1),
+ * where `gradient_sum` is the cumulative combined gradient up to and including that row. The
+ * candidate is accepted when it lies in (threshold[row_index], threshold[row_index + 1]].
+ *
+ * Returns 0 for an accepted candidate, which is also stored in threshold[row_index + 1], and the
+ * maximum float otherwise.
+ */
+class SampleRateDelta : public thrust::binary_function<float, size_t, float> {
+ public:
+  SampleRateDelta(common::Span<float> threshold, size_t n_rows, size_t sample_rows)
+      : threshold_(threshold), n_rows_(n_rows), sample_rows_(sample_rows) {}
+
+  XGBOOST_DEVICE float operator()(float gradient_sum, size_t row_index) const {
+    // The denominator is only meaningful when it is strictly positive. Compare before
+    // subtracting so the unsigned arithmetic can neither wrap around nor produce zero.
+    const size_t shifted_rows = sample_rows_ + row_index + 1;
+    if (shifted_rows <= n_rows_) {
+      return std::numeric_limits<float>::max();
+    }
+    float lower = threshold_[row_index];
+    float upper = threshold_[row_index + 1];
+    float u = gradient_sum / static_cast<float>(shifted_rows - n_rows_);
+    if (u > lower && u <= upper) {
+      // NOTE(review): this slot is also read as `lower` by the invocation for row_index + 1;
+      // the caller picks the first accepted row, so a later spurious match is not selected.
+      threshold_[row_index + 1] = u;
+      return 0.0f;
+    } else {
+      return std::numeric_limits<float>::max();
+    }
+  }
+
+ private:
+  /*! \brief Sorted combined gradients; holds one more slot than there are rows. */
+  common::Span<float> threshold_;
+  /*! \brief Total number of rows. */
+  size_t n_rows_;
+  /*! \brief Desired number of sampled rows. */
+  size_t sample_rows_;
+};
+
+/*! \brief A functor that performs Poisson sampling, and scales gradient pairs by 1/p_i.
+ *
+ * The selection probability of a row is its combined gradient divided by the threshold stored
+ * at `threshold_index`. Rows that are not selected are replaced by an empty gradient pair.
+ */
+class PoissonSampling : public thrust::binary_function<GradientPair, size_t, GradientPair> {
+ public:
+  PoissonSampling(common::Span<float> threshold, size_t threshold_index, RandomWeight rnd)
+      : threshold_(threshold), threshold_index_(threshold_index), rnd_(rnd) {}
+
+  XGBOOST_DEVICE GradientPair operator()(const GradientPair& gpair, size_t i) {
+    // A row with neither gradient nor hessian is never selected; hand it back unchanged.
+    const bool is_empty = gpair.GetGrad() == 0 && gpair.GetHess() == 0;
+    if (is_empty) {
+      return gpair;
+    }
+
+    const float cutoff = threshold_[threshold_index_];
+    const float probability = combine_(gpair) / cutoff;
+
+    // Rows at or above the threshold are always kept, without rescaling.
+    if (probability >= 1) {
+      return gpair;
+    }
+
+    // Otherwise keep the row with the computed probability and scale it by 1/p.
+    if (rnd_(i) <= probability) {
+      return gpair / probability;
+    }
+    return GradientPair();
+  }
+
+ private:
+  /*! \brief Buffer holding the sampling threshold. */
+  common::Span<float> threshold_;
+  /*! \brief Position of the sampling threshold inside the buffer. */
+  size_t threshold_index_;
+  /*! \brief Source of the per-row random weight. */
+  RandomWeight rnd_;
+  /*! \brief Reduces a gradient pair to a single float. */
+  CombineGradientPair combine_;
+};
+
+/*! \brief Remember the page that Sample() hands back. */
+NoSampling::NoSampling(EllpackPageImpl* page) : page_{page} {}
+
+/*! \brief Return all rows: the stored page and the gradient pairs exactly as given. */
+GradientBasedSample NoSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
+  GradientBasedSample everything{dmat->Info().num_row_, page_, gpair};
+  return everything;
+}
+
+/*! \brief Allocate a page with room for `n_rows` rows, using the layout info of `page`. */
+ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl* page,
+                                                   size_t n_rows,
+                                                   const BatchParam& batch_param)
+    : batch_param_(batch_param),
+      page_(new EllpackPageImpl(batch_param.gpu_id, page->matrix.info, n_rows)) {}
+
+/*! \brief Return all rows in one page; the page is assembled from the batches on first use. */
+GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair> gpair,
+                                                     DMatrix* dmat) {
+  if (!page_concatenated_) {
+    // Append every external memory ELLPACK page to the in-memory page, moving the write
+    // position forward by whatever each copy reports.
+    size_t write_offset = 0;
+    for (auto& batch : dmat->GetBatches<EllpackPage>(batch_param_)) {
+      const size_t copied = page_->Copy(batch_param_.gpu_id, batch.Impl(), write_offset);
+      write_offset += copied;
+    }
+    // The concatenated page is reused by later calls.
+    page_concatenated_ = true;
+  }
+  return {dmat->Info().num_row_, page_.get(), gpair};
+}
+
+/*! \brief Remember the page to return and the fraction of rows to keep. */
+UniformSampling::UniformSampling(EllpackPageImpl* page, float subsample)
+    : page_{page}, subsample_{subsample} {}
+
+/*! \brief Zero out gradient pairs in place with probability 1 - subsample. */
+GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
+  // One seed is drawn from the global generator per call; the per-row decision is then a
+  // function of that seed and the row position.
+  BernoulliTrial should_discard(common::GlobalRandom()(), subsample_);
+  thrust::counting_iterator<size_t> row_positions(0);
+  thrust::replace_if(dh::tbegin(gpair), dh::tend(gpair),
+                     row_positions,
+                     should_discard,
+                     GradientPair());
+
+  // The page is not compacted, so the full row count is reported.
+  return {dmat->Info().num_row_, page_, gpair};
+}
+
+/*! \brief Store the sampling parameters and allocate one row-index slot per row. */
+ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl* page,
+                                                             size_t n_rows,
+                                                             const BatchParam& batch_param,
+                                                             float subsample)
+    : original_page_(page), batch_param_(batch_param), subsample_(subsample) {
+  ba_.Allocate(batch_param_.gpu_id, &sample_row_index_, n_rows);
+}
+
+/*! \brief Uniformly sample rows and compact the kept rows into a single ELLPACK page.
+ *
+ * `gpair` is modified in place: discarded rows are set to an empty gradient pair. The returned
+ * sample refers to buffers owned by this object (the compacted page and gradient pairs), which
+ * are replaced on the next call.
+ */
+GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientPair> gpair,
+                                                          DMatrix* dmat) {
+  // Set gradient pair to 0 with p = 1 - subsample
+  thrust::replace_if(dh::tbegin(gpair), dh::tend(gpair),
+                     thrust::counting_iterator<size_t>(0),
+                     BernoulliTrial(common::GlobalRandom()(), subsample_),
+                     GradientPair());
+
+  // Count the sampled rows.
+  size_t sample_rows = thrust::count_if(dh::tbegin(gpair), dh::tend(gpair), IsNonZero());
+
+  // Compact gradient pairs.
+  gpair_.resize(sample_rows);
+  thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
+
+  // Index the sample rows: flag the kept rows, turn the flags into each row's position among
+  // the kept rows, then invalidate the entries that belong to discarded rows.
+  thrust::transform(dh::tbegin(gpair), dh::tend(gpair), dh::tbegin(sample_row_index_), IsNonZero());
+  thrust::exclusive_scan(dh::tbegin(sample_row_index_), dh::tend(sample_row_index_),
+                         dh::tbegin(sample_row_index_));
+  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
+                    dh::tbegin(sample_row_index_),
+                    dh::tbegin(sample_row_index_),
+                    ClearEmptyRows());
+
+  // Create a new ELLPACK page with empty rows.
+  page_.reset();  // Release the device memory first before reallocating
+  page_.reset(new EllpackPageImpl(batch_param_.gpu_id,
+                                  original_page_->matrix.info,
+                                  sample_rows));
+
+  // Compact the ELLPACK pages into the single sample page.
+  thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
+  for (auto& batch : dmat->GetBatches<EllpackPage>(batch_param_)) {
+    page_->Compact(batch_param_.gpu_id, batch.Impl(), sample_row_index_);
+  }
+
+  return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
+}
+
+/*! \brief Store the sampling parameters and allocate the scratch buffers.
+ *
+ * The threshold buffer holds one more element than there are rows; the cumulative gradient
+ * buffer holds one element per row.
+ */
+GradientBasedSampling::GradientBasedSampling(EllpackPageImpl* page,
+                                             size_t n_rows,
+                                             const BatchParam& batch_param,
+                                             float subsample) : page_(page), subsample_(subsample) {
+  ba_.Allocate(batch_param.gpu_id,
+               &threshold_, n_rows + 1,
+               &grad_sum_, n_rows);
+}
+
+/*! \brief Sample rows in place with probability proportional to their combined gradient. */
+GradientBasedSample GradientBasedSampling::Sample(common::Span<GradientPair> gpair,
+                                                  DMatrix* dmat) {
+  const size_t n_rows = dmat->Info().num_row_;
+  // The float product is converted to an integral row count.
+  const size_t target_rows = n_rows * subsample_;
+  const size_t threshold_index =
+      GradientBasedSampler::CalculateThresholdIndex(gpair, threshold_, grad_sum_, target_rows);
+
+  // Poisson sampling overwrites the gradient pairs: unselected rows become empty pairs.
+  PoissonSampling poisson(threshold_, threshold_index, RandomWeight(common::GlobalRandom()()));
+  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
+                    thrust::counting_iterator<size_t>(0),
+                    dh::tbegin(gpair),
+                    poisson);
+
+  // The page is not compacted, so the full row count is reported.
+  return {n_rows, page_, gpair};
+}
+
+/*! \brief Store the sampling parameters and allocate the scratch buffers.
+ *
+ * The threshold buffer holds one more element than there are rows; the cumulative gradient and
+ * row index buffers hold one element per row.
+ */
+ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
+    EllpackPageImpl* page,
+    size_t n_rows,
+    const BatchParam& batch_param,
+    float subsample) : original_page_(page), batch_param_(batch_param), subsample_(subsample) {
+  ba_.Allocate(batch_param.gpu_id,
+               &threshold_, n_rows + 1,
+               &grad_sum_, n_rows,
+               &sample_row_index_, n_rows);
+}
+
+/*! \brief Sample rows by gradient and compact the kept rows into a single ELLPACK page.
+ *
+ * `gpair` is modified in place. The returned sample refers to buffers owned by this object,
+ * which are replaced on the next call.
+ */
+GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<GradientPair> gpair,
+                                                                DMatrix* dmat) {
+  const size_t n_rows = dmat->Info().num_row_;
+  // The float product is converted to an integral row count.
+  const size_t target_rows = n_rows * subsample_;
+  const size_t threshold_index =
+      GradientBasedSampler::CalculateThresholdIndex(gpair, threshold_, grad_sum_, target_rows);
+
+  auto gpair_first = dh::tbegin(gpair);
+  auto gpair_last = dh::tend(gpair);
+  auto index_first = dh::tbegin(sample_row_index_);
+  auto index_last = dh::tend(sample_row_index_);
+
+  // Poisson sampling overwrites the gradient pairs: unselected rows become empty pairs.
+  PoissonSampling poisson(threshold_, threshold_index, RandomWeight(common::GlobalRandom()()));
+  thrust::transform(gpair_first, gpair_last,
+                    thrust::counting_iterator<size_t>(0),
+                    gpair_first,
+                    poisson);
+
+  // Number of rows that survived the sampling.
+  const size_t sample_rows = thrust::count_if(gpair_first, gpair_last, IsNonZero());
+
+  // Gather the surviving gradient pairs into the owned, densely packed buffer.
+  gpair_.resize(sample_rows);
+  thrust::copy_if(gpair_first, gpair_last, gpair_.begin(), IsNonZero());
+
+  // Build the row index: flag the kept rows, turn the flags into each row's position among the
+  // kept rows, then invalidate the entries that belong to discarded rows.
+  thrust::transform(gpair_first, gpair_last, index_first, IsNonZero());
+  thrust::exclusive_scan(index_first, index_last, index_first);
+  thrust::transform(gpair_first, gpair_last, index_first, index_first, ClearEmptyRows());
+
+  // Drop the previous sample page before allocating its replacement so that both are never
+  // resident at the same time.
+  page_.reset();
+  page_.reset(new EllpackPageImpl(batch_param_.gpu_id,
+                                  original_page_->matrix.info,
+                                  sample_rows));
+
+  // Start from a zeroed buffer and compact every batch into the sample page.
+  thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
+  for (auto& batch : dmat->GetBatches<EllpackPage>(batch_param_)) {
+    page_->Compact(batch_param_.gpu_id, batch.Impl(), sample_row_index_);
+  }
+
+  return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
+}
+
+/*! \brief Pick the sampling strategy from the subsample rate, the sampling method, and whether
+ * the data is in external memory mode.
+ */
+GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
+                                           size_t n_rows,
+                                           const BatchParam& batch_param,
+                                           float subsample,
+                                           int sampling_method) {
+  monitor_.Init("gradient_based_sampler");
+
+  const bool is_sampling = subsample < 1.0;
+  // A page whose row count differs from n_rows is taken to mean external memory mode.
+  const bool is_external_memory = page->matrix.n_rows != n_rows;
+
+  if (!is_sampling) {
+    // Nothing to sample: every row is used.
+    if (is_external_memory) {
+      strategy_.reset(new ExternalMemoryNoSampling(page, n_rows, batch_param));
+    } else {
+      strategy_.reset(new NoSampling(page));
+    }
+    return;
+  }
+
+  switch (sampling_method) {
+    case TrainParam::kUniform:
+      if (is_external_memory) {
+        strategy_.reset(new ExternalMemoryUniformSampling(page, n_rows, batch_param, subsample));
+      } else {
+        strategy_.reset(new UniformSampling(page, subsample));
+      }
+      break;
+    case TrainParam::kGradientBased:
+      if (is_external_memory) {
+        strategy_.reset(
+            new ExternalMemoryGradientBasedSampling(page, n_rows, batch_param, subsample));
+      } else {
+        strategy_.reset(new GradientBasedSampling(page, n_rows, batch_param, subsample));
+      }
+      break;
+    default:
+      LOG(FATAL) << "unknown sampling method";
+  }
+}
+
+// Delegate to the strategy chosen at construction, timing the call under the "Sample" label.
+GradientBasedSample GradientBasedSampler::Sample(common::Span<GradientPair> gpair,
+                                                 DMatrix* dmat) {
+  monitor_.StartCuda("Sample");
+  GradientBasedSample result = strategy_->Sample(gpair, dmat);
+  monitor_.StopCuda("Sample");
+  return result;
+}
+
+/*! \brief Find the position in `threshold` of the value used to normalize the sampling
+ * probabilities.
+ *
+ * `threshold` is overwritten with the sorted combined gradients followed by a sentinel, and
+ * `grad_sum` is used as scratch space. The returned index is one past the first row whose
+ * candidate threshold was accepted by SampleRateDelta.
+ */
+size_t GradientBasedSampler::CalculateThresholdIndex(common::Span<GradientPair> gpair,
+                                                     common::Span<float> threshold,
+                                                     common::Span<float> grad_sum,
+                                                     size_t sample_rows) {
+  auto threshold_first = dh::tbegin(threshold);
+  auto threshold_sentinel = dh::tend(threshold) - 1;
+  auto grad_sum_first = dh::tbegin(grad_sum);
+  auto grad_sum_last = dh::tend(grad_sum);
+
+  // The extra trailing slot acts as an upper bound for the largest combined gradient.
+  thrust::fill(threshold_sentinel, threshold_sentinel + 1, std::numeric_limits<float>::max());
+
+  // Combined gradient of every row, sorted in ascending order (the sentinel stays in place).
+  thrust::transform(dh::tbegin(gpair), dh::tend(gpair), threshold_first, CombineGradientPair());
+  thrust::sort(threshold_first, threshold_sentinel);
+
+  // Running sum of the sorted combined gradients.
+  thrust::inclusive_scan(threshold_first, threshold_sentinel, grad_sum_first);
+
+  // Replace each running sum with 0 if that row yields a valid threshold, and with the float
+  // maximum otherwise.
+  thrust::transform(grad_sum_first, grad_sum_last,
+                    thrust::counting_iterator<size_t>(0),
+                    grad_sum_first,
+                    SampleRateDelta(threshold, gpair.size(), sample_rows));
+
+  // Locate the smallest entry; SampleRateDelta stored the accepted threshold one slot after it.
+  thrust::device_ptr<float> best = thrust::min_element(grad_sum_first, grad_sum_last);
+  return thrust::distance(grad_sum_first, best) + 1;
+}
+
+};  // namespace tree
+};  // namespace xgboost
--- a/src/tree/gpu_hist/gradient_based_sampler.cuh
+++ b/src/tree/gpu_hist/gradient_based_sampler.cuh
@@ -0,0 +1,153 @@
+/*!
+ * Copyright 2019 by XGBoost Contributors
+ */
+#pragma once
+#include <xgboost/base.h>
+#include <xgboost/data.h>
+#include <xgboost/span.h>
+
+#include "../../common/device_helpers.cuh"
+#include "../../data/ellpack_page.cuh"
+
+namespace xgboost {
+namespace tree {
+
+struct GradientBasedSample {
+  /*!\brief Number of rows in the sample. Strategies that do not compact the page report the
+   * row count of the DMatrix here. */
+  size_t sample_rows;
+  /*!\brief Sampled rows in ELLPACK format. The pointer is not owned by this struct. */
+  EllpackPageImpl* page;
+  /*!\brief Gradient pairs for the sampled rows. */
+  common::Span<GradientPair> gpair;
+};
+
+/*! \brief Interface implemented by every sampling strategy. */
+class SamplingStrategy {
+ public:
+  /*! \brief Strategies are owned and destroyed through a pointer to this base class, so the
+   * destructor must be virtual for the derived members to be released. */
+  virtual ~SamplingStrategy() = default;
+
+  /*! \brief Sample from a DMatrix based on the given gradient pairs. */
+  virtual GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) = 0;
+};
+
+/*! \brief No sampling in in-memory mode.
+ *
+ * Sample() returns the page given at construction together with the unmodified gradient pairs.
+ */
+class NoSampling : public SamplingStrategy {
+ public:
+  explicit NoSampling(EllpackPageImpl* page);
+  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+
+ private:
+  EllpackPageImpl* page_;  // Page returned by Sample(); never deleted by this class.
+};
+
+/*! \brief No sampling in external memory mode.
+ *
+ * The first call to Sample() copies all ELLPACK batches of the DMatrix into one page owned by
+ * this object; later calls reuse that page.
+ */
+class ExternalMemoryNoSampling : public SamplingStrategy {
+ public:
+  ExternalMemoryNoSampling(EllpackPageImpl* page,
+                           size_t n_rows,
+                           const BatchParam& batch_param);
+  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+
+ private:
+  BatchParam batch_param_;  // Used to request the ELLPACK batches and select the device.
+  std::unique_ptr<EllpackPageImpl> page_;  // Concatenated page, allocated in the constructor.
+  bool page_concatenated_{false};  // Set once the batches have been copied into page_.
+};
+
+/*! \brief Uniform sampling in in-memory mode.
+ *
+ * Sample() clears discarded gradient pairs in place and returns the page given at construction.
+ */
+class UniformSampling : public SamplingStrategy {
+ public:
+  UniformSampling(EllpackPageImpl* page, float subsample);
+  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+
+ private:
+  EllpackPageImpl* page_;  // Page returned by Sample(); never deleted by this class.
+  float subsample_;  // Rows are discarded with probability 1 - subsample_.
+};
+
+/*! \brief Uniform sampling in external memory mode.
+ *
+ * Sample() clears discarded gradient pairs in place, then compacts the kept rows and their
+ * gradient pairs into buffers owned by this object.
+ */
+class ExternalMemoryUniformSampling : public SamplingStrategy {
+ public:
+  ExternalMemoryUniformSampling(EllpackPageImpl* page,
+                                size_t n_rows,
+                                const BatchParam& batch_param,
+                                float subsample);
+  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+
+ private:
+  dh::BulkAllocator ba_;  // Backs sample_row_index_.
+  EllpackPageImpl* original_page_;  // Supplies the layout info for the sample page; not owned.
+  BatchParam batch_param_;  // Used to request the ELLPACK batches and select the device.
+  float subsample_;  // Rows are discarded with probability 1 - subsample_.
+  std::unique_ptr<EllpackPageImpl> page_;  // Compacted sample page, rebuilt by every Sample().
+  dh::device_vector<GradientPair> gpair_{};  // Gradient pairs of the kept rows.
+  common::Span<size_t> sample_row_index_;  // Per row: position in the sample, or size_t max.
+};
+
+/*! \brief Gradient-based sampling in in-memory mode.
+ *
+ * Sample() performs Poisson sampling on the gradient pairs in place and returns the page given
+ * at construction.
+ */
+class GradientBasedSampling : public SamplingStrategy {
+ public:
+  GradientBasedSampling(EllpackPageImpl* page,
+                        size_t n_rows,
+                        const BatchParam& batch_param,
+                        float subsample);
+  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+
+ private:
+  EllpackPageImpl* page_;  // Page returned by Sample(); never deleted by this class.
+  float subsample_;  // Fraction of the rows to aim for.
+  dh::BulkAllocator ba_;  // Backs threshold_ and grad_sum_.
+  common::Span<float> threshold_;  // Scratch buffer with n_rows + 1 elements.
+  common::Span<float> grad_sum_;  // Scratch buffer with n_rows elements.
+};
+
+/*! \brief Gradient-based sampling in external memory mode.
+ *
+ * Sample() performs Poisson sampling on the gradient pairs in place, then compacts the kept
+ * rows and their gradient pairs into buffers owned by this object.
+ */
+class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
+ public:
+  ExternalMemoryGradientBasedSampling(EllpackPageImpl* page,
+                                      size_t n_rows,
+                                      const BatchParam& batch_param,
+                                      float subsample);
+  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+
+ private:
+  dh::BulkAllocator ba_;  // Backs threshold_, grad_sum_ and sample_row_index_.
+  EllpackPageImpl* original_page_;  // Supplies the layout info for the sample page; not owned.
+  BatchParam batch_param_;  // Used to request the ELLPACK batches and select the device.
+  float subsample_;  // Fraction of the rows to aim for.
+  common::Span<float> threshold_;  // Scratch buffer with n_rows + 1 elements.
+  common::Span<float> grad_sum_;  // Scratch buffer with n_rows elements.
+  std::unique_ptr<EllpackPageImpl> page_;  // Compacted sample page, rebuilt by every Sample().
+  dh::device_vector<GradientPair> gpair_;  // Gradient pairs of the kept rows.
+  common::Span<size_t> sample_row_index_;  // Per row: position in the sample, or size_t max.
+};
+
+/*! \brief Draw a sample of rows from a DMatrix.
+ *
+ * The constructor selects one SamplingStrategy; Sample() forwards to it.
+ *
+ * \see Ke, G., Meng, Q., Finley, T., Wang, T., Chen, W., Ma, W., ... & Liu, T. Y. (2017).
+ * Lightgbm: A highly efficient gradient boosting decision tree. In Advances in Neural Information
+ * Processing Systems (pp. 3146-3154).
+ * \see Zhu, R. (2016). Gradient-based sampling: An adaptive importance sampling for least-squares.
+ * In Advances in Neural Information Processing Systems (pp. 406-414).
+ * \see Ohlsson, E. (1998). Sequential poisson sampling. Journal of official Statistics, 14(2), 149.
+ */
+class GradientBasedSampler {
+ public:
+  /*! \brief Choose the strategy: sampling is enabled when `subsample` is below 1, and
+   * `sampling_method` then selects between uniform and gradient-based sampling. */
+  GradientBasedSampler(EllpackPageImpl* page,
+                       size_t n_rows,
+                       const BatchParam& batch_param,
+                       float subsample,
+                       int sampling_method);
+
+  /*! \brief Sample from a DMatrix based on the given gradient pairs. */
+  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat);
+
+  /*! \brief Calculate the threshold used to normalize sampling probabilities.
+   *
+   * Returns the index into `threshold` at which the value is stored. `threshold` and `grad_sum`
+   * are overwritten; the samplers in this file pass buffers of gpair.size() + 1 and gpair.size()
+   * elements respectively. `sample_rows` is the desired number of sampled rows. */
+  static size_t CalculateThresholdIndex(common::Span<GradientPair> gpair,
+                                        common::Span<float> threshold,
+                                        common::Span<float> grad_sum,
+                                        size_t sample_rows);
+
+ private:
+  common::Monitor monitor_;  // Times the calls to Sample().
+  std::unique_ptr<SamplingStrategy> strategy_;  // Strategy selected by the constructor.
+};
+};  // namespace tree
+};  // namespace xgboost
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -125,7 +125,6 @@ class RowPartitioner {
      idx += segment.begin;
      RowIndexT ridx = d_ridx[idx];
      bst_node_t new_position = op(ridx);  // new node id
-      if (new_position == kIgnoredTreePosition) return;
      KERNEL_CHECK(new_position == left_nidx || new_position == right_nidx);
      AtomicIncrement(d_left_count, new_position == left_nidx);
      d_position[idx] = new_position;