Remove feature grouping (#7018)

Co-authored-by: Kirill Shvets <kirill.shvets@intel.com>
This commit is contained in:
ShvetsKS
2021-06-02 23:35:26 +03:00
committed by GitHub
parent 05db6a6c29
commit 5cdaac00c1
6 changed files with 22 additions and 420 deletions

View File

@@ -80,8 +80,6 @@ struct TrainParam : public XGBoostParameter<TrainParam> {
// percentage threshold for treating a feature as sparse
// e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
double sparse_threshold;
// use feature grouping? (default yes)
int enable_feature_grouping;
// when grouping features, how many "conflicts" to allow.
// conflict is when an instance has nonzero values for two or more features
// default is 0, meaning features should be strictly complementary
@@ -199,9 +197,6 @@ struct TrainParam : public XGBoostParameter<TrainParam> {
// ------ From cpu quantile histogram -------.
DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
.describe("percentage threshold for treating a feature as sparse");
DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
.describe("if >0, enable feature grouping to ameliorate work imbalance "
"among worker threads");
DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
.describe("when grouping features, how many \"conflicts\" to allow."
"conflict is when an instance has nonzero values for two or more features."

View File

@@ -71,7 +71,7 @@ void QuantileHistMaker::CallBuilderUpdate(const std::unique_ptr<Builder<Gradient
DMatrix *dmat,
const std::vector<RegTree *> &trees) {
for (auto tree : trees) {
builder->Update(gmat_, gmatb_, column_matrix_, gpair, dmat, tree);
builder->Update(gmat_, column_matrix_, gpair, dmat, tree);
}
}
void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair,
@@ -81,9 +81,6 @@ void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair,
updater_monitor_.Start("GmatInitialization");
gmat_.Init(dmat, static_cast<uint32_t>(param_.max_bin));
column_matrix_.Init(gmat_, param_.sparse_threshold);
if (param_.enable_feature_grouping > 0) {
gmatb_.Init(gmat_, column_matrix_, param_);
}
updater_monitor_.Stop("GmatInitialization");
// A proper solution is putting the cut matrix in DMatrix, see:
// https://github.com/dmlc/xgboost/issues/5143
@@ -295,7 +292,6 @@ void QuantileHistMaker::Builder<GradientSumT>::SetHistRowsAdder(
template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
const GHistIndexMatrix &gmat,
const GHistIndexBlockMatrix &gmatb,
const DMatrix& fmat,
RegTree *p_tree,
const std::vector<GradientPair> &gpair_h,
@@ -311,7 +307,7 @@ void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
int sync_count = 0;
hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, p_tree);
BuildLocalHistograms(gmat, gmatb, p_tree, gpair_h);
BuildLocalHistograms(gmat, p_tree, gpair_h);
hist_synchronizer_->SyncHistograms(this, starting_index, sync_count, p_tree);
this->InitNewNode(CPUExpandEntry::kRootNid, gmat, gpair_h, fmat, *p_tree);
@@ -325,7 +321,6 @@ void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
template<typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::BuildLocalHistograms(
const GHistIndexMatrix &gmat,
const GHistIndexBlockMatrix &gmatb,
RegTree *p_tree,
const std::vector<GradientPair> &gpair_h) {
builder_monitor_.Start("BuildLocalHistograms");
@@ -355,7 +350,7 @@ void QuantileHistMaker::Builder<GradientSumT>::BuildLocalHistograms(
auto rid_set = RowSetCollection::Elem(start_of_row_set + r.begin(),
start_of_row_set + r.end(),
nid);
BuildHist(gpair_h, rid_set, gmat, gmatb, hist_buffer_.GetInitializedHist(tid, nid_in_set));
BuildHist(gpair_h, rid_set, gmat, hist_buffer_.GetInitializedHist(tid, nid_in_set));
});
builder_monitor_.Stop("BuildLocalHistograms");
@@ -446,7 +441,6 @@ void QuantileHistMaker::Builder<GradientSumT>::BuildNodeStats(
template<typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
const GHistIndexMatrix& gmat,
const GHistIndexBlockMatrix& gmatb,
const ColumnMatrix& column_matrix,
DMatrix* p_fmat,
RegTree* p_tree,
@@ -456,7 +450,7 @@ void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
Driver<CPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param_.grow_policy));
std::vector<CPUExpandEntry> expand;
InitRoot(gmat, gmatb, *p_fmat, p_tree, gpair_h, &num_leaves, &expand);
InitRoot(gmat, *p_fmat, p_tree, gpair_h, &num_leaves, &expand);
driver.Push(expand[0]);
int depth = 0;
@@ -478,7 +472,7 @@ void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
int sync_count = 0;
hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, p_tree);
if (depth < param_.max_depth) {
BuildLocalHistograms(gmat, gmatb, p_tree, gpair_h);
BuildLocalHistograms(gmat, p_tree, gpair_h);
hist_synchronizer_->SyncHistograms(this, starting_index, sync_count, p_tree);
}
@@ -506,8 +500,9 @@ void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::Update(
const GHistIndexMatrix &gmat, const GHistIndexBlockMatrix &gmatb,
const ColumnMatrix &column_matrix, HostDeviceVector<GradientPair> *gpair,
const GHistIndexMatrix &gmat,
const ColumnMatrix &column_matrix,
HostDeviceVector<GradientPair> *gpair,
DMatrix *p_fmat, RegTree *p_tree) {
builder_monitor_.Start("Update");
@@ -525,7 +520,7 @@ void QuantileHistMaker::Builder<GradientSumT>::Update(
this->InitData(gmat, *p_fmat, *p_tree, gpair_ptr);
ExpandTree(gmat, gmatb, column_matrix, p_fmat, p_tree, *gpair_ptr);
ExpandTree(gmat, column_matrix, p_fmat, p_tree, *gpair_ptr);
for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;

View File

@@ -111,7 +111,6 @@ class MemStackAllocator {
namespace tree {
using xgboost::common::GHistIndexMatrix;
using xgboost::common::GHistIndexBlockMatrix;
using xgboost::common::GHistIndexRow;
using xgboost::common::HistCollection;
using xgboost::common::RowSetCollection;
@@ -245,8 +244,6 @@ class QuantileHistMaker: public TreeUpdater {
TrainParam param_;
// quantized data matrix
GHistIndexMatrix gmat_;
// (optional) data matrix with feature grouping
GHistIndexBlockMatrix gmatb_;
// column accessor
ColumnMatrix column_matrix_;
DMatrix const* p_last_dmat_ {nullptr};
@@ -289,7 +286,6 @@ class QuantileHistMaker: public TreeUpdater {
}
// update one tree, growing
virtual void Update(const GHistIndexMatrix& gmat,
const GHistIndexBlockMatrix& gmatb,
const ColumnMatrix& column_matrix,
HostDeviceVector<GradientPair>* gpair,
DMatrix* p_fmat,
@@ -298,14 +294,9 @@ class QuantileHistMaker: public TreeUpdater {
inline void BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const GHistIndexBlockMatrix& gmatb,
GHistRowT hist) {
if (param_.enable_feature_grouping > 0) {
hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, hist);
} else {
hist_builder_.BuildHist(gpair, row_indices, gmat, hist,
data_layout_ != DataLayout::kSparseData);
}
hist_builder_.BuildHist(gpair, row_indices, gmat, hist,
data_layout_ != DataLayout::kSparseData);
}
inline void SubtractionTrick(GHistRowT self,
@@ -386,12 +377,10 @@ class QuantileHistMaker: public TreeUpdater {
bool SplitContainsMissingValues(const GradStats e, const NodeEntry& snode);
void BuildLocalHistograms(const GHistIndexMatrix &gmat,
const GHistIndexBlockMatrix &gmatb,
RegTree *p_tree,
const std::vector<GradientPair> &gpair_h);
void InitRoot(const GHistIndexMatrix &gmat,
const GHistIndexBlockMatrix &gmatb,
const DMatrix& fmat,
RegTree *p_tree,
const std::vector<GradientPair> &gpair_h,
@@ -415,7 +404,6 @@ class QuantileHistMaker: public TreeUpdater {
const std::vector<CPUExpandEntry>& nodes_for_apply_split, RegTree *p_tree);
void ExpandTree(const GHistIndexMatrix& gmat,
const GHistIndexBlockMatrix& gmatb,
const ColumnMatrix& column_matrix,
DMatrix* p_fmat,
RegTree* p_tree,