Patch to improve multithreaded performance scaling (#2493)

* Patch to improve multithreaded performance scaling

Change parallel strategy for histogram construction.
Instead of partitioning data rows among multiple threads, partition feature
columns instead. Useful heuristics for assigning partitions have been adopted
from LightGBM project.

* Add missing header to satisfy MSVC

* Restore max_bin and related parameters to TrainParam

* Fix lint error

* inline functions do not require static keyword

* Feature grouping algorithm accepting FastHistParam

Feature grouping algorithm accepts many parameters (3+), and it gets annoying to
pass them one by one. Instead, simply pass the reference to FastHistParam. The
definition of FastHistParam has been moved to a separate header file to
accommodate this change.
This commit is contained in:
Philip Cho
2017-07-07 08:25:07 -07:00
committed by Tianqi Chen
parent 6bfc472bec
commit ba820847f9
6 changed files with 466 additions and 52 deletions

View File

@@ -0,0 +1,64 @@
/*!
* Copyright 2017 by Contributors
* \file fast_hist_param.h
* \brief parameters for histogram-based training
* \author Philip Cho, Tianqi Chen
*/
#ifndef XGBOOST_TREE_FAST_HIST_PARAM_H_
#define XGBOOST_TREE_FAST_HIST_PARAM_H_
namespace xgboost {
namespace tree {
/*! \brief training parameters for histogram-based training */
struct FastHistParam : public dmlc::Parameter<FastHistParam> {
  // integral data type to be used with columnar data storage
  enum class DataType { uint8 = 1, uint16 = 2, uint32 = 4 };
  int colmat_dtype;
  // percentage threshold for treating a feature as sparse
  // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
  double sparse_threshold;
  // use feature grouping? (default yes)
  int enable_feature_grouping;
  // when grouping features, how many "conflicts" to allow.
  // conflict is when an instance has nonzero values for two or more features
  // default is 0, meaning features should be strictly complementary
  double max_conflict_rate;
  // when grouping features, how much effort to expend to prevent singleton groups
  // we'll try to insert each feature into existing groups before creating a new group
  // for that feature; to save time, only up to (max_search_group) of existing groups
  // will be considered. If set to zero, ALL existing groups will be examined
  unsigned max_search_group;

  // declare the parameters
  DMLC_DECLARE_PARAMETER(FastHistParam) {
    DMLC_DECLARE_FIELD(colmat_dtype)
        .set_default(static_cast<int>(DataType::uint32))
        .add_enum("uint8", static_cast<int>(DataType::uint8))
        .add_enum("uint16", static_cast<int>(DataType::uint16))
        .add_enum("uint32", static_cast<int>(DataType::uint32))
        // NOTE: adjacent string literals concatenate without any separator,
        // so each fragment must end with an explicit space.
        .describe("Integral data type to be used with columnar data storage. "
                  "May carry marginal performance implications. Reserved for "
                  "advanced use");
    DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
        .describe("percentage threshold for treating a feature as sparse");
    DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(1)
        .describe("if >0, enable feature grouping to ameliorate work imbalance "
                  "among worker threads");
    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
        .describe("when grouping features, how many \"conflicts\" to allow. "
                  "conflict is when an instance has nonzero values for two or "
                  "more features. default is 0, meaning features should be "
                  "strictly complementary.");
    DMLC_DECLARE_FIELD(max_search_group).set_lower_bound(0).set_default(100)
        .describe("when grouping features, how much effort to expend to prevent "
                  "singleton groups. We'll try to insert each feature into existing "
                  "groups before creating a new group for that feature; to save time, "
                  "only up to (max_search_group) of existing groups will be "
                  "considered. If set to zero, ALL existing groups will be examined.");
  }
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_FAST_HIST_PARAM_H_

View File

@@ -30,8 +30,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
int max_leaves;
// if using histogram based algorithm, maximum number of bins per feature
int max_bin;
enum class DataType { uint8 = 1, uint16 = 2, uint32 = 4 };
int colmat_dtype;
// growing policy
enum TreeGrowPolicy { kDepthWise = 0, kLossGuide = 1 };
int grow_policy;
@@ -111,14 +109,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
"Tree growing policy. 0: favor splitting at nodes closest to the node, "
"i.e. grow depth-wise. 1: favor splitting at nodes with highest loss "
"change. (cf. LightGBM)");
DMLC_DECLARE_FIELD(colmat_dtype)
.set_default(static_cast<int>(DataType::uint32))
.add_enum("uint8", static_cast<int>(DataType::uint8))
.add_enum("uint16", static_cast<int>(DataType::uint16))
.add_enum("uint32", static_cast<int>(DataType::uint32))
.describe("Integral data type to be used with columnar data storage."
"May carry marginal performance implications. Reserved for "
"advanced use");
DMLC_DECLARE_FIELD(min_child_weight)
.set_lower_bound(0.0f)
.set_default(1.0f)

View File

@@ -13,6 +13,7 @@
#include <iomanip>
#include <numeric>
#include "./param.h"
#include "./fast_hist_param.h"
#include "../common/random.h"
#include "../common/bitmap.h"
#include "../common/sync.h"
@@ -25,6 +26,7 @@ namespace tree {
using xgboost::common::HistCutMatrix;
using xgboost::common::GHistIndexMatrix;
using xgboost::common::GHistIndexBlockMatrix;
using xgboost::common::GHistIndexRow;
using xgboost::common::GHistEntry;
using xgboost::common::HistCollection;
@@ -36,6 +38,8 @@ using xgboost::common::Column;
DMLC_REGISTRY_FILE_TAG(updater_fast_hist);
DMLC_REGISTER_PARAMETER(FastHistParam);
/*! \brief construct a tree using quantized feature values */
template<typename TStats, typename TConstraint>
class FastHistMaker: public TreeUpdater {
@@ -47,6 +51,7 @@ class FastHistMaker: public TreeUpdater {
}
pruner_->Init(args);
param.InitAllowUnknown(args);
fhparam.InitAllowUnknown(args);
is_gmat_initialized_ = false;
}
@@ -59,7 +64,10 @@ class FastHistMaker: public TreeUpdater {
hmat_.Init(dmat, param.max_bin);
gmat_.cut = &hmat_;
gmat_.Init(dmat);
column_matrix_.Init(gmat_, static_cast<xgboost::common::DataType>(param.colmat_dtype));
column_matrix_.Init(gmat_, fhparam);
if (fhparam.enable_feature_grouping > 0) {
gmatb_.Init(gmat_, column_matrix_, fhparam);
}
is_gmat_initialized_ = true;
if (param.debug_verbose > 0) {
LOG(INFO) << "Generating gmat: " << dmlc::GetTime() - tstart << " sec";
@@ -71,10 +79,10 @@ class FastHistMaker: public TreeUpdater {
TConstraint::Init(&param, dmat->info().num_col);
// build tree
if (!builder_) {
builder_.reset(new Builder(param, std::move(pruner_)));
builder_.reset(new Builder(param, fhparam, std::move(pruner_)));
}
for (size_t i = 0; i < trees.size(); ++i) {
builder_->Update(gmat_, column_matrix_, gpair, dmat, trees[i]);
builder_->Update(gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
}
param.learning_rate = lr;
}
@@ -91,9 +99,13 @@ class FastHistMaker: public TreeUpdater {
protected:
// training parameter
TrainParam param;
FastHistParam fhparam;
// data sketch
HistCutMatrix hmat_;
// quantized data matrix
GHistIndexMatrix gmat_;
// (optional) data matrix with feature grouping
GHistIndexBlockMatrix gmatb_;
// column accessor
ColumnMatrix column_matrix_;
bool is_gmat_initialized_;
@@ -136,11 +148,13 @@ class FastHistMaker: public TreeUpdater {
public:
// constructor
explicit Builder(const TrainParam& param,
const FastHistParam& fhparam,
std::unique_ptr<TreeUpdater> pruner)
: param(param), pruner_(std::move(pruner)),
: param(param), fhparam(fhparam), pruner_(std::move(pruner)),
p_last_tree_(nullptr), p_last_fmat_(nullptr) {}
// update one tree, growing
virtual void Update(const GHistIndexMatrix& gmat,
const GHistIndexBlockMatrix& gmatb,
const ColumnMatrix& column_matrix,
const std::vector<bst_gpair>& gpair,
DMatrix* p_fmat,
@@ -168,7 +182,7 @@ class FastHistMaker: public TreeUpdater {
for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
tstart = dmlc::GetTime();
hist_.AddHistRow(nid);
builder_.BuildHist(gpair, row_set_collection_[nid], gmat, feat_set, hist_[nid]);
BuildHist(gpair, row_set_collection_[nid], gmat, gmatb, feat_set, hist_[nid]);
time_build_hist += dmlc::GetTime() - tstart;
tstart = dmlc::GetTime();
@@ -203,13 +217,11 @@ class FastHistMaker: public TreeUpdater {
hist_.AddHistRow(cleft);
hist_.AddHistRow(cright);
if (row_set_collection_[cleft].size() < row_set_collection_[cright].size()) {
builder_.BuildHist(gpair, row_set_collection_[cleft], gmat, feat_set,
hist_[cleft]);
builder_.SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
BuildHist(gpair, row_set_collection_[cleft], gmat, gmatb, feat_set, hist_[cleft]);
SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
} else {
builder_.BuildHist(gpair, row_set_collection_[cright], gmat, feat_set,
hist_[cright]);
builder_.SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
BuildHist(gpair, row_set_collection_[cright], gmat, gmatb, feat_set, hist_[cright]);
SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
}
time_build_hist += dmlc::GetTime() - tstart;
@@ -280,6 +292,23 @@ class FastHistMaker: public TreeUpdater {
}
}
inline void BuildHist(const std::vector<bst_gpair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const GHistIndexBlockMatrix& gmatb,
const std::vector<bst_uint>& feat_set,
GHistRow hist) {
if (fhparam.enable_feature_grouping > 0) {
hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, feat_set, hist);
} else {
hist_builder_.BuildHist(gpair, row_indices, gmat, feat_set, hist);
}
}
inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
hist_builder_.SubtractionTrick(self, sibling, parent);
}
inline bool UpdatePredictionCache(const DMatrix* data,
std::vector<bst_float>* p_out_preds) {
std::vector<bst_float>& out_preds = *p_out_preds;
@@ -351,7 +380,7 @@ class FastHistMaker: public TreeUpdater {
{
this->nthread = omp_get_num_threads();
}
builder_.Init(this->nthread, nbins);
hist_builder_.Init(this->nthread, nbins);
CHECK_EQ(info.root_index.size(), 0U);
std::vector<bst_uint>& row_indices = row_set_collection_.row_indices_;
@@ -885,6 +914,7 @@ class FastHistMaker: public TreeUpdater {
// --data fields--
const TrainParam& param;
const FastHistParam& fhparam;
// number of omp thread used during training
int nthread;
// Per feature: shuffle index of each feature index
@@ -904,7 +934,7 @@ class FastHistMaker: public TreeUpdater {
/*! \brief local prediction cache; maps node id to leaf value */
std::vector<float> leaf_value_cache_;
GHistBuilder builder_;
GHistBuilder hist_builder_;
std::unique_ptr<TreeUpdater> pruner_;
// back pointers to tree and data matrix