Column sampling at individual nodes (splits). (#3971)

* Column sampling at individual nodes (splits).

* Documented colsample_bynode parameter.

- also updated documentation for colsample_by* parameters

* Updated documentation.

* GetFeatureSet() returns shared pointer to std::vector.

* Sync sampled columns across multiple processes.
This commit is contained in:
Andy Adinets
2018-12-14 15:37:35 +01:00
committed by Jiaming Yuan
parent e0a279114e
commit 42bf90eb8f
8 changed files with 140 additions and 80 deletions

View File

@@ -7,14 +7,15 @@
#ifndef XGBOOST_COMMON_RANDOM_H_
#define XGBOOST_COMMON_RANDOM_H_
#include <rabit/rabit.h>
#include <xgboost/logging.h>
#include <algorithm>
#include <vector>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
#include <random>
#include "host_device_vector.h"
namespace xgboost {
namespace common {
@@ -75,27 +76,36 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
/**
* \class ColumnSampler
*
 * \brief Handles selection of columns due to colsample_bytree and
 * colsample_bylevel parameters. Should be initialised before tree
 * construction and reset when tree construction is completed.
 * \brief Handles selection of columns due to colsample_bytree, colsample_bylevel and
 * colsample_bynode parameters. Should be initialised before tree construction and
 * reset when tree construction is completed.
*/
class ColumnSampler {
HostDeviceVector<int> feature_set_tree_;
std::map<int, HostDeviceVector<int>> feature_set_level_;
std::shared_ptr<std::vector<int>> feature_set_tree_;
std::map<int, std::shared_ptr<std::vector<int>>> feature_set_level_;
float colsample_bylevel_{1.0f};
float colsample_bytree_{1.0f};
float colsample_bynode_{1.0f};
std::vector<int> ColSample(std::vector<int> features, float colsample) const {
if (colsample == 1.0f) return features;
std::shared_ptr<std::vector<int>> ColSample
(std::shared_ptr<std::vector<int>> p_features, float colsample) const {
if (colsample == 1.0f) return p_features;
const auto& features = *p_features;
CHECK_GT(features.size(), 0);
int n = std::max(1, static_cast<int>(colsample * features.size()));
auto p_new_features = std::make_shared<std::vector<int>>();
auto& new_features = *p_new_features;
new_features.resize(features.size());
std::copy(features.begin(), features.end(), new_features.begin());
std::shuffle(new_features.begin(), new_features.end(), common::GlobalRandom());
new_features.resize(n);
std::sort(new_features.begin(), new_features.end());
std::shuffle(features.begin(), features.end(), common::GlobalRandom());
features.resize(n);
std::sort(features.begin(), features.end());
// ensure that new_features are the same across ranks
rabit::Broadcast(&new_features, 0);
return features;
return p_new_features;
}
public:
@@ -103,44 +113,60 @@ class ColumnSampler {
* \brief Initialise this object before use.
*
* \param num_col
* \param colsample_bynode
* \param colsample_bylevel
* \param colsample_bytree
* \param skip_index_0 (Optional) True to skip index 0.
*/
void Init(int64_t num_col, float colsample_bylevel, float colsample_bytree,
bool skip_index_0 = false) {
this->colsample_bylevel_ = colsample_bylevel;
this->colsample_bytree_ = colsample_bytree;
this->Reset();
void Init(int64_t num_col, float colsample_bynode, float colsample_bylevel,
float colsample_bytree, bool skip_index_0 = false) {
colsample_bylevel_ = colsample_bylevel;
colsample_bytree_ = colsample_bytree;
colsample_bynode_ = colsample_bynode;
if (feature_set_tree_ == nullptr) {
feature_set_tree_ = std::make_shared<std::vector<int>>();
}
Reset();
int begin_idx = skip_index_0 ? 1 : 0;
auto& feature_set_h = feature_set_tree_.HostVector();
feature_set_h.resize(num_col - begin_idx);
feature_set_tree_->resize(num_col - begin_idx);
std::iota(feature_set_tree_->begin(), feature_set_tree_->end(), begin_idx);
std::iota(feature_set_h.begin(), feature_set_h.end(), begin_idx);
feature_set_h = ColSample(feature_set_h, this->colsample_bytree_);
feature_set_tree_ = ColSample(feature_set_tree_, colsample_bytree_);
}
/**
* \brief Resets this object.
*/
void Reset() {
feature_set_tree_.HostVector().clear();
feature_set_tree_->clear();
feature_set_level_.clear();
}
HostDeviceVector<int>& GetFeatureSet(int depth) {
if (this->colsample_bylevel_ == 1.0f) {
/**
* \brief Samples a feature set.
*
* \param depth The tree depth of the node at which to sample.
* \return The sampled feature set.
* \note If colsample_bynode_ < 1.0, this method creates a new feature set each time it
* is called. Therefore, it should be called only once per node.
*/
std::shared_ptr<std::vector<int>> GetFeatureSet(int depth) {
if (colsample_bylevel_ == 1.0f && colsample_bynode_ == 1.0f) {
return feature_set_tree_;
}
if (feature_set_level_.count(depth) == 0) {
// Level sampling, level does not yet exist so generate it
auto& level = feature_set_level_[depth].HostVector();
level = ColSample(feature_set_tree_.HostVector(), this->colsample_bylevel_);
feature_set_level_[depth] = ColSample(feature_set_tree_, colsample_bylevel_);
}
// Level sampling
return feature_set_level_[depth];
if (colsample_bynode_ == 1.0f) {
// Level sampling
return feature_set_level_[depth];
}
// Need to sample for the node individually
return ColSample(feature_set_level_[depth], colsample_bynode_);
}
};

View File

@@ -50,7 +50,9 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
float max_delta_step;
// whether we want to do subsample
float subsample;
// whether to subsample columns each split, in each level
// whether to subsample columns in each split (node)
float colsample_bynode;
// whether to subsample columns in each level
float colsample_bylevel;
// whether to subsample columns during tree construction
float colsample_bytree;
@@ -149,6 +151,10 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
.set_range(0.0f, 1.0f)
.set_default(1.0f)
.describe("Row subsample ratio of training instance.");
DMLC_DECLARE_FIELD(colsample_bynode)
.set_range(0.0f, 1.0f)
.set_default(1.0f)
.describe("Subsample ratio of columns, resample on each node (split).");
DMLC_DECLARE_FIELD(colsample_bylevel)
.set_range(0.0f, 1.0f)
.set_default(1.0f)

View File

@@ -168,8 +168,8 @@ class ColMaker: public TreeUpdater {
}
}
{
column_sampler_.Init(fmat.Info().num_col_, param_.colsample_bylevel,
param_.colsample_bytree);
column_sampler_.Init(fmat.Info().num_col_, param_.colsample_bynode,
param_.colsample_bylevel, param_.colsample_bytree);
}
{
// setup temp space for each thread
@@ -625,7 +625,8 @@ class ColMaker: public TreeUpdater {
const std::vector<GradientPair> &gpair,
DMatrix *p_fmat,
RegTree *p_tree) {
const std::vector<int> &feat_set = column_sampler_.GetFeatureSet(depth).HostVector();
auto p_feature_set = column_sampler_.GetFeatureSet(depth);
const auto& feat_set = *p_feature_set;
for (const auto &batch : p_fmat->GetSortedColumnBatches()) {
this->UpdateSolution(batch, feat_set, gpair, p_fmat);
}

View File

@@ -499,6 +499,8 @@ struct DeviceShard {
dh::DVec<GradientPair> node_sum_gradients_d;
/*! \brief row offset in SparsePage (the input data). */
thrust::device_vector<size_t> row_ptrs;
/*! \brief On-device feature set, only actually used on one of the devices */
thrust::device_vector<int> feature_set_d;
/*! The row offset for this shard. */
bst_uint row_begin_idx;
bst_uint row_end_idx;
@@ -579,28 +581,31 @@ struct DeviceShard {
}
DeviceSplitCandidate EvaluateSplit(int nidx,
const HostDeviceVector<int>& feature_set,
const std::vector<int>& feature_set,
ValueConstraint value_constraint) {
dh::safe_cuda(cudaSetDevice(device_id_));
auto d_split_candidates = temp_memory.GetSpan<DeviceSplitCandidate>(feature_set.Size());
auto d_split_candidates = temp_memory.GetSpan<DeviceSplitCandidate>(feature_set.size());
feature_set_d.resize(feature_set.size());
auto d_features = common::Span<int>(feature_set_d.data().get(),
feature_set_d.size());
dh::safe_cuda(cudaMemcpy(d_features.data(), feature_set.data(),
d_features.size_bytes(), cudaMemcpyDefault));
DeviceNodeStats node(node_sum_gradients[nidx], nidx, param);
feature_set.Reshard(GPUSet::Range(device_id_, 1));
// One block for each feature
int constexpr BLOCK_THREADS = 256;
EvaluateSplitKernel<BLOCK_THREADS, GradientSumT>
<<<uint32_t(feature_set.Size()), BLOCK_THREADS, 0>>>(
hist.GetNodeHistogram(nidx), feature_set.DeviceSpan(device_id_), node,
cut_.feature_segments.GetSpan(), cut_.min_fvalue.GetSpan(),
cut_.gidx_fvalue_map.GetSpan(), GPUTrainingParam(param),
d_split_candidates, value_constraint, monotone_constraints.GetSpan());
<<<uint32_t(feature_set.size()), BLOCK_THREADS, 0>>>
(hist.GetNodeHistogram(nidx), d_features, node,
cut_.feature_segments.GetSpan(), cut_.min_fvalue.GetSpan(),
cut_.gidx_fvalue_map.GetSpan(), GPUTrainingParam(param),
d_split_candidates, value_constraint, monotone_constraints.GetSpan());
dh::safe_cuda(cudaDeviceSynchronize());
std::vector<DeviceSplitCandidate> split_candidates(feature_set.Size());
dh::safe_cuda(
cudaMemcpy(split_candidates.data(), d_split_candidates.data(),
split_candidates.size() * sizeof(DeviceSplitCandidate),
cudaMemcpyDeviceToHost));
std::vector<DeviceSplitCandidate> split_candidates(feature_set.size());
dh::safe_cuda(cudaMemcpy(split_candidates.data(), d_split_candidates.data(),
split_candidates.size() * sizeof(DeviceSplitCandidate),
cudaMemcpyDeviceToHost));
DeviceSplitCandidate best_split;
for (auto candidate : split_candidates) {
best_split.Update(candidate, param);
@@ -1009,7 +1014,8 @@ class GPUHistMakerSpecialised{
}
monitor_.Stop("InitDataOnce", dist_.Devices());
column_sampler_.Init(info_->num_col_, param_.colsample_bylevel, param_.colsample_bytree);
column_sampler_.Init(info_->num_col_, param_.colsample_bynode,
param_.colsample_bylevel, param_.colsample_bytree);
// Copy gpair & reset memory
monitor_.Start("InitDataReset", dist_.Devices());
@@ -1100,7 +1106,7 @@ class GPUHistMakerSpecialised{
DeviceSplitCandidate EvaluateSplit(int nidx, RegTree* p_tree) {
return shards_.front()->EvaluateSplit(
nidx, column_sampler_.GetFeatureSet(p_tree->GetDepth(nidx)),
nidx, *column_sampler_.GetFeatureSet(p_tree->GetDepth(nidx)),
node_value_constraints_[nidx]);
}

View File

@@ -354,11 +354,11 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat,
p_last_fmat_ = &fmat;
// initialize feature index
if (data_layout_ == kDenseDataOneBased) {
column_sampler_.Init(info.num_col_, param_.colsample_bylevel,
param_.colsample_bytree, true);
column_sampler_.Init(info.num_col_, param_.colsample_bynode,
param_.colsample_bylevel, param_.colsample_bytree, true);
} else {
column_sampler_.Init(info.num_col_, param_.colsample_bylevel,
param_.colsample_bytree, false);
column_sampler_.Init(info.num_col_, param_.colsample_bynode,
param_.colsample_bylevel, param_.colsample_bytree, false);
}
}
if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
@@ -400,8 +400,8 @@ void QuantileHistMaker::Builder::EvaluateSplit(int nid,
const RegTree& tree) {
// start enumeration
const MetaInfo& info = fmat.Info();
const auto& feature_set = column_sampler_.GetFeatureSet(
tree.GetDepth(nid)).HostVector();
auto p_feature_set = column_sampler_.GetFeatureSet(tree.GetDepth(nid));
const auto& feature_set = *p_feature_set;
const auto nfeature = static_cast<bst_uint>(feature_set.size());
const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
best_split_tloc_.resize(nthread);