GPU memory usage fixes + column sampling refactor (#3635)
* Remove thrust copy calls
* Fix histogram memory usage
* Cap extreme histogram memory usage
* More efficient column sampling
* Use column sampler across updaters
* More efficient split evaluation on GPU with column sampling
This commit is contained in:
@@ -402,7 +402,6 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
|
||||
void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist) {
|
||||
data_.resize(nbins_ * nthread_, GHistEntry());
|
||||
std::fill(data_.begin(), data_.end(), GHistEntry());
|
||||
@@ -461,7 +460,6 @@ void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
|
||||
void GHistBuilder::BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist) {
|
||||
constexpr int kUnroll = 8; // loop unrolling factor
|
||||
const size_t nblock = gmatb.GetNumBlock();
|
||||
|
||||
@@ -266,13 +266,11 @@ class GHistBuilder {
|
||||
void BuildHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist);
|
||||
// same, with feature grouping
|
||||
void BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist);
|
||||
// construct a histogram via subtraction trick
|
||||
void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);
|
||||
|
||||
@@ -102,6 +102,7 @@ void HostDeviceVector<T>::Reshard(GPUSet devices) { }
|
||||
template class HostDeviceVector<bst_float>;
|
||||
template class HostDeviceVector<GradientPair>;
|
||||
template class HostDeviceVector<unsigned int>;
|
||||
template class HostDeviceVector<int>;
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
@@ -77,7 +77,9 @@ struct HostDeviceVectorImpl {
|
||||
|
||||
void LazySyncHost() {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
thrust::copy(data_.begin(), data_.end(), vec_->data_h_.begin() + start_);
|
||||
dh::safe_cuda(
|
||||
cudaMemcpy(vec_->data_h_.data(), data_.data().get() + start_,
|
||||
data_.size() * sizeof(T), cudaMemcpyDeviceToHost));
|
||||
on_d_ = false;
|
||||
}
|
||||
|
||||
@@ -90,8 +92,9 @@ struct HostDeviceVectorImpl {
|
||||
size_t size_d = ShardSize(size_h, ndevices, index_);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
data_.resize(size_d);
|
||||
thrust::copy(vec_->data_h_.begin() + start_,
|
||||
vec_->data_h_.begin() + start_ + size_d, data_.begin());
|
||||
dh::safe_cuda(cudaMemcpy(data_.data().get(),
|
||||
vec_->data_h_.data() + start_,
|
||||
size_d * sizeof(T), cudaMemcpyHostToDevice));
|
||||
on_d_ = true;
|
||||
// this may cause a race condition if LazySyncDevice() is called
|
||||
// from multiple threads in parallel;
|
||||
@@ -186,18 +189,22 @@ struct HostDeviceVectorImpl {
|
||||
void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
|
||||
CHECK_EQ(end - begin, Size());
|
||||
if (on_h_) {
|
||||
thrust::copy(begin, end, data_h_.begin());
|
||||
dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
|
||||
(end - begin) * sizeof(T),
|
||||
cudaMemcpyDeviceToHost));
|
||||
} else {
|
||||
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
|
||||
shard.ScatterFrom(begin.get());
|
||||
});
|
||||
shard.ScatterFrom(begin.get());
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
|
||||
CHECK_EQ(end - begin, Size());
|
||||
if (on_h_) {
|
||||
thrust::copy(data_h_.begin(), data_h_.end(), begin);
|
||||
dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
|
||||
data_h_.size() * sizeof(T),
|
||||
cudaMemcpyHostToDevice));
|
||||
} else {
|
||||
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.GatherTo(begin); });
|
||||
}
|
||||
@@ -400,5 +407,6 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
|
||||
template class HostDeviceVector<bst_float>;
|
||||
template class HostDeviceVector<GradientPair>;
|
||||
template class HostDeviceVector<unsigned int>;
|
||||
template class HostDeviceVector<int>;
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -7,8 +7,14 @@
|
||||
#ifndef XGBOOST_COMMON_RANDOM_H_
|
||||
#define XGBOOST_COMMON_RANDOM_H_
|
||||
|
||||
#include <random>
|
||||
#include <xgboost/logging.h>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
#include <random>
|
||||
#include "host_device_vector.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -66,6 +72,78 @@ using GlobalRandomEngine = RandomEngine;
|
||||
*/
|
||||
GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
|
||||
|
||||
/**
|
||||
* \class ColumnSampler
|
||||
*
|
||||
* \brief Handles selection of columns due to colsample_bytree and
|
||||
* colsample_bylevel parameters. Should be initialised before tree
|
||||
* construction and to reset when tree construction is completed.
|
||||
*/
|
||||
|
||||
class ColumnSampler {
|
||||
HostDeviceVector<int> feature_set_tree_;
|
||||
std::map<int, HostDeviceVector<int>> feature_set_level_;
|
||||
float colsample_bylevel_{1.0f};
|
||||
float colsample_bytree_{1.0f};
|
||||
|
||||
std::vector<int> ColSample(std::vector<int> features, float colsample) const {
|
||||
if (colsample == 1.0f) return features;
|
||||
CHECK_GT(features.size(), 0);
|
||||
int n = std::max(1, static_cast<int>(colsample * features.size()));
|
||||
|
||||
std::shuffle(features.begin(), features.end(), common::GlobalRandom());
|
||||
features.resize(n);
|
||||
std::sort(features.begin(), features.end());
|
||||
|
||||
return features;
|
||||
}
|
||||
|
||||
public:
|
||||
/**
|
||||
* \brief Initialise this object before use.
|
||||
*
|
||||
* \param num_col
|
||||
* \param colsample_bylevel
|
||||
* \param colsample_bytree
|
||||
* \param skip_index_0 (Optional) True to skip index 0.
|
||||
*/
|
||||
void Init(int64_t num_col, float colsample_bylevel, float colsample_bytree,
|
||||
bool skip_index_0 = false) {
|
||||
this->colsample_bylevel_ = colsample_bylevel;
|
||||
this->colsample_bytree_ = colsample_bytree;
|
||||
this->Reset();
|
||||
|
||||
int begin_idx = skip_index_0 ? 1 : 0;
|
||||
auto& feature_set_h = feature_set_tree_.HostVector();
|
||||
feature_set_h.resize(num_col - begin_idx);
|
||||
|
||||
std::iota(feature_set_h.begin(), feature_set_h.end(), begin_idx);
|
||||
feature_set_h = ColSample(feature_set_h, this->colsample_bytree_);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Resets this object.
|
||||
*/
|
||||
void Reset() {
|
||||
feature_set_tree_.HostVector().clear();
|
||||
feature_set_level_.clear();
|
||||
}
|
||||
|
||||
HostDeviceVector<int>& GetFeatureSet(int depth) {
|
||||
if (this->colsample_bylevel_ == 1.0f) {
|
||||
return feature_set_tree_;
|
||||
}
|
||||
|
||||
if (feature_set_level_.count(depth) == 0) {
|
||||
// Level sampling, level does not yet exist so generate it
|
||||
auto& level = feature_set_level_[depth].HostVector();
|
||||
level = ColSample(feature_set_tree_.HostVector(), this->colsample_bylevel_);
|
||||
}
|
||||
// Level sampling
|
||||
return feature_set_level_[depth];
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_RANDOM_H_
|
||||
|
||||
Reference in New Issue
Block a user