Patch to improve multithreaded performance scaling (#2493)

* Patch to improve multithreaded performance scaling

Change parallel strategy for histogram construction.
Instead of partitioning data rows among multiple threads, partition feature
columns instead. Useful heuristics for assigning partitions have been adopted
from LightGBM project.

* Add missing header to satisfy MSVC

* Restore max_bin and related parameters to TrainParam

* Fix lint error

* inline functions do not require static keyword

* Feature grouping algorithm accepting FastHistParam

Feature grouping algorithm accepts many parameters (3+), and it gets annoying to
pass them one by one. Instead, simply pass the reference to FastHistParam. The
definition of FastHistParam has been moved to a separate header file to
accommodate this change.
This commit is contained in:
Philip Cho
2017-07-07 08:25:07 -07:00
committed by Tianqi Chen
parent 6bfc472bec
commit ba820847f9
6 changed files with 466 additions and 52 deletions

View File

@@ -0,0 +1,64 @@
/*!
* Copyright 2017 by Contributors
* \file fast_hist_param.h
* \brief parameters for histogram-based training
* \author Philip Cho, Tianqi Chen
*/
#ifndef XGBOOST_TREE_FAST_HIST_PARAM_H_
#define XGBOOST_TREE_FAST_HIST_PARAM_H_
namespace xgboost {
namespace tree {
/*! \brief training parameters for histogram-based training */
struct FastHistParam : public dmlc::Parameter<FastHistParam> {
  // integral data type to be used with columnar data storage
  enum class DataType { uint8 = 1, uint16 = 2, uint32 = 4 };
  int colmat_dtype;
  // percentage threshold for treating a feature as sparse
  // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
  double sparse_threshold;
  // use feature grouping? (default yes)
  int enable_feature_grouping;
  // when grouping features, how many "conflicts" to allow.
  // conflict is when an instance has nonzero values for two or more features
  // default is 0, meaning features should be strictly complementary
  double max_conflict_rate;
  // when grouping features, how much effort to expend to prevent singleton groups
  // we'll try to insert each feature into existing groups before creating a new group
  // for that feature; to save time, only up to (max_search_group) of existing groups
  // will be considered. If set to zero, ALL existing groups will be examined
  unsigned max_search_group;

  // declare the parameters
  DMLC_DECLARE_PARAMETER(FastHistParam) {
    DMLC_DECLARE_FIELD(colmat_dtype)
        .set_default(static_cast<int>(DataType::uint32))
        .add_enum("uint8", static_cast<int>(DataType::uint8))
        .add_enum("uint16", static_cast<int>(DataType::uint16))
        .add_enum("uint32", static_cast<int>(DataType::uint32))
        // NOTE: adjacent string literals concatenate without any separator,
        // so each fragment must end with an explicit space.
        .describe("Integral data type to be used with columnar data storage. "
                  "May carry marginal performance implications. Reserved for "
                  "advanced use");
    DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
        .describe("percentage threshold for treating a feature as sparse");
    DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(1)
        .describe("if >0, enable feature grouping to ameliorate work imbalance "
                  "among worker threads");
    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
        .describe("when grouping features, how many \"conflicts\" to allow. "
                  "conflict is when an instance has nonzero values for two or "
                  "more features. default is 0, meaning features should be "
                  "strictly complementary.");
    DMLC_DECLARE_FIELD(max_search_group).set_lower_bound(0).set_default(100)
        .describe("when grouping features, how much effort to expend to prevent "
                  "singleton groups. We'll try to insert each feature into existing "
                  "groups before creating a new group for that feature; to save time, "
                  "only up to (max_search_group) of existing groups will be "
                  "considered. If set to zero, ALL existing groups will be examined.");
  }
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_FAST_HIST_PARAM_H_

View File

@@ -30,8 +30,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
int max_leaves;
// if using histogram based algorithm, maximum number of bins per feature
int max_bin;
enum class DataType { uint8 = 1, uint16 = 2, uint32 = 4 };
int colmat_dtype;
// growing policy
enum TreeGrowPolicy { kDepthWise = 0, kLossGuide = 1 };
int grow_policy;
@@ -111,14 +109,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
"Tree growing policy. 0: favor splitting at nodes closest to the node, "
"i.e. grow depth-wise. 1: favor splitting at nodes with highest loss "
"change. (cf. LightGBM)");
DMLC_DECLARE_FIELD(colmat_dtype)
.set_default(static_cast<int>(DataType::uint32))
.add_enum("uint8", static_cast<int>(DataType::uint8))
.add_enum("uint16", static_cast<int>(DataType::uint16))
.add_enum("uint32", static_cast<int>(DataType::uint32))
.describe("Integral data type to be used with columnar data storage."
"May carry marginal performance implications. Reserved for "
"advanced use");
DMLC_DECLARE_FIELD(min_child_weight)
.set_lower_bound(0.0f)
.set_default(1.0f)

View File

@@ -13,6 +13,7 @@
#include <iomanip>
#include <numeric>
#include "./param.h"
#include "./fast_hist_param.h"
#include "../common/random.h"
#include "../common/bitmap.h"
#include "../common/sync.h"
@@ -25,6 +26,7 @@ namespace tree {
using xgboost::common::HistCutMatrix;
using xgboost::common::GHistIndexMatrix;
using xgboost::common::GHistIndexBlockMatrix;
using xgboost::common::GHistIndexRow;
using xgboost::common::GHistEntry;
using xgboost::common::HistCollection;
@@ -36,6 +38,8 @@ using xgboost::common::Column;
DMLC_REGISTRY_FILE_TAG(updater_fast_hist);
DMLC_REGISTER_PARAMETER(FastHistParam);
/*! \brief construct a tree using quantized feature values */
template<typename TStats, typename TConstraint>
class FastHistMaker: public TreeUpdater {
@@ -47,6 +51,7 @@ class FastHistMaker: public TreeUpdater {
}
pruner_->Init(args);
param.InitAllowUnknown(args);
fhparam.InitAllowUnknown(args);
is_gmat_initialized_ = false;
}
@@ -59,7 +64,10 @@ class FastHistMaker: public TreeUpdater {
hmat_.Init(dmat, param.max_bin);
gmat_.cut = &hmat_;
gmat_.Init(dmat);
column_matrix_.Init(gmat_, static_cast<xgboost::common::DataType>(param.colmat_dtype));
column_matrix_.Init(gmat_, fhparam);
if (fhparam.enable_feature_grouping > 0) {
gmatb_.Init(gmat_, column_matrix_, fhparam);
}
is_gmat_initialized_ = true;
if (param.debug_verbose > 0) {
LOG(INFO) << "Generating gmat: " << dmlc::GetTime() - tstart << " sec";
@@ -71,10 +79,10 @@ class FastHistMaker: public TreeUpdater {
TConstraint::Init(&param, dmat->info().num_col);
// build tree
if (!builder_) {
builder_.reset(new Builder(param, std::move(pruner_)));
builder_.reset(new Builder(param, fhparam, std::move(pruner_)));
}
for (size_t i = 0; i < trees.size(); ++i) {
builder_->Update(gmat_, column_matrix_, gpair, dmat, trees[i]);
builder_->Update(gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
}
param.learning_rate = lr;
}
@@ -91,9 +99,13 @@ class FastHistMaker: public TreeUpdater {
protected:
// training parameter
TrainParam param;
FastHistParam fhparam;
// data sketch
HistCutMatrix hmat_;
// quantized data matrix
GHistIndexMatrix gmat_;
// (optional) data matrix with feature grouping
GHistIndexBlockMatrix gmatb_;
// column accessor
ColumnMatrix column_matrix_;
bool is_gmat_initialized_;
@@ -136,11 +148,13 @@ class FastHistMaker: public TreeUpdater {
public:
// constructor
explicit Builder(const TrainParam& param,
const FastHistParam& fhparam,
std::unique_ptr<TreeUpdater> pruner)
: param(param), pruner_(std::move(pruner)),
: param(param), fhparam(fhparam), pruner_(std::move(pruner)),
p_last_tree_(nullptr), p_last_fmat_(nullptr) {}
// update one tree, growing
virtual void Update(const GHistIndexMatrix& gmat,
const GHistIndexBlockMatrix& gmatb,
const ColumnMatrix& column_matrix,
const std::vector<bst_gpair>& gpair,
DMatrix* p_fmat,
@@ -168,7 +182,7 @@ class FastHistMaker: public TreeUpdater {
for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
tstart = dmlc::GetTime();
hist_.AddHistRow(nid);
builder_.BuildHist(gpair, row_set_collection_[nid], gmat, feat_set, hist_[nid]);
BuildHist(gpair, row_set_collection_[nid], gmat, gmatb, feat_set, hist_[nid]);
time_build_hist += dmlc::GetTime() - tstart;
tstart = dmlc::GetTime();
@@ -203,13 +217,11 @@ class FastHistMaker: public TreeUpdater {
hist_.AddHistRow(cleft);
hist_.AddHistRow(cright);
if (row_set_collection_[cleft].size() < row_set_collection_[cright].size()) {
builder_.BuildHist(gpair, row_set_collection_[cleft], gmat, feat_set,
hist_[cleft]);
builder_.SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
BuildHist(gpair, row_set_collection_[cleft], gmat, gmatb, feat_set, hist_[cleft]);
SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
} else {
builder_.BuildHist(gpair, row_set_collection_[cright], gmat, feat_set,
hist_[cright]);
builder_.SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
BuildHist(gpair, row_set_collection_[cright], gmat, gmatb, feat_set, hist_[cright]);
SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
}
time_build_hist += dmlc::GetTime() - tstart;
@@ -280,6 +292,23 @@ class FastHistMaker: public TreeUpdater {
}
}
inline void BuildHist(const std::vector<bst_gpair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const GHistIndexBlockMatrix& gmatb,
const std::vector<bst_uint>& feat_set,
GHistRow hist) {
if (fhparam.enable_feature_grouping > 0) {
hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, feat_set, hist);
} else {
hist_builder_.BuildHist(gpair, row_indices, gmat, feat_set, hist);
}
}
inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
hist_builder_.SubtractionTrick(self, sibling, parent);
}
inline bool UpdatePredictionCache(const DMatrix* data,
std::vector<bst_float>* p_out_preds) {
std::vector<bst_float>& out_preds = *p_out_preds;
@@ -351,7 +380,7 @@ class FastHistMaker: public TreeUpdater {
{
this->nthread = omp_get_num_threads();
}
builder_.Init(this->nthread, nbins);
hist_builder_.Init(this->nthread, nbins);
CHECK_EQ(info.root_index.size(), 0U);
std::vector<bst_uint>& row_indices = row_set_collection_.row_indices_;
@@ -885,6 +914,7 @@ class FastHistMaker: public TreeUpdater {
// --data fields--
const TrainParam& param;
const FastHistParam& fhparam;
// number of omp thread used during training
int nthread;
// Per feature: shuffle index of each feature index
@@ -904,7 +934,7 @@ class FastHistMaker: public TreeUpdater {
/*! \brief local prediction cache; maps node id to leaf value */
std::vector<float> leaf_value_cache_;
GHistBuilder builder_;
GHistBuilder hist_builder_;
std::unique_ptr<TreeUpdater> pruner_;
// back pointers to tree and data matrix