Remove feature grouping (#7018)
Co-authored-by: Kirill Shvets <kirill.shvets@intel.com>
This commit is contained in:
@@ -187,267 +187,6 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename BinIdxType>
|
||||
static size_t GetConflictCount(const std::vector<bool>& mark,
|
||||
const Column<BinIdxType>& column_input,
|
||||
size_t max_cnt) {
|
||||
size_t ret = 0;
|
||||
if (column_input.GetType() == xgboost::common::kDenseColumn) {
|
||||
const DenseColumn<BinIdxType>& column
|
||||
= static_cast<const DenseColumn<BinIdxType>& >(column_input);
|
||||
for (size_t i = 0; i < column.Size(); ++i) {
|
||||
if ((!column.IsMissing(i)) && mark[i]) {
|
||||
++ret;
|
||||
if (ret > max_cnt) {
|
||||
return max_cnt + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const SparseColumn<BinIdxType>& column
|
||||
= static_cast<const SparseColumn<BinIdxType>& >(column_input);
|
||||
for (size_t i = 0; i < column.Size(); ++i) {
|
||||
if (mark[column.GetRowIdx(i)]) {
|
||||
++ret;
|
||||
if (ret > max_cnt) {
|
||||
return max_cnt + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename BinIdxType>
|
||||
inline void
|
||||
MarkUsed(std::vector<bool>* p_mark, const Column<BinIdxType>& column_input) {
|
||||
std::vector<bool>& mark = *p_mark;
|
||||
if (column_input.GetType() == xgboost::common::kDenseColumn) {
|
||||
const DenseColumn<BinIdxType>& column
|
||||
= static_cast<const DenseColumn<BinIdxType>& >(column_input);
|
||||
for (size_t i = 0; i < column.Size(); ++i) {
|
||||
if (!column.IsMissing(i)) {
|
||||
mark[i] = true;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const SparseColumn<BinIdxType>& column
|
||||
= static_cast<const SparseColumn<BinIdxType>& >(column_input);
|
||||
for (size_t i = 0; i < column.Size(); ++i) {
|
||||
mark[column.GetRowIdx(i)] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename BinIdxType>
|
||||
inline void SetGroup(const unsigned fid, const Column<BinIdxType>& column,
|
||||
const size_t max_conflict_cnt, const std::vector<size_t>& search_groups,
|
||||
std::vector<size_t>* p_group_conflict_cnt,
|
||||
std::vector<std::vector<bool>>* p_conflict_marks,
|
||||
std::vector<std::vector<unsigned>>* p_groups,
|
||||
std::vector<size_t>* p_group_nnz, const size_t cur_fid_nnz, const size_t nrow) {
|
||||
bool need_new_group = true;
|
||||
std::vector<size_t>& group_conflict_cnt = *p_group_conflict_cnt;
|
||||
std::vector<std::vector<bool>>& conflict_marks = *p_conflict_marks;
|
||||
std::vector<std::vector<unsigned>>& groups = *p_groups;
|
||||
std::vector<size_t>& group_nnz = *p_group_nnz;
|
||||
|
||||
// examine each candidate group: is it okay to insert fid?
|
||||
for (auto gid : search_groups) {
|
||||
const size_t rest_max_cnt = max_conflict_cnt - group_conflict_cnt[gid];
|
||||
const size_t cnt = GetConflictCount(conflict_marks[gid], column, rest_max_cnt);
|
||||
if (cnt <= rest_max_cnt) {
|
||||
need_new_group = false;
|
||||
groups[gid].push_back(fid);
|
||||
group_conflict_cnt[gid] += cnt;
|
||||
group_nnz[gid] += cur_fid_nnz - cnt;
|
||||
MarkUsed(&conflict_marks[gid], column);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// create new group if necessary
|
||||
if (need_new_group) {
|
||||
groups.emplace_back();
|
||||
groups.back().push_back(fid);
|
||||
group_conflict_cnt.push_back(0);
|
||||
conflict_marks.emplace_back(nrow, false);
|
||||
MarkUsed(&conflict_marks.back(), column);
|
||||
group_nnz.emplace_back(cur_fid_nnz);
|
||||
}
|
||||
}
|
||||
|
||||
inline std::vector<std::vector<unsigned>>
|
||||
FindGroups(const std::vector<unsigned>& feature_list,
|
||||
const std::vector<size_t>& feature_nnz,
|
||||
const ColumnMatrix& colmat,
|
||||
size_t nrow,
|
||||
const tree::TrainParam& param) {
|
||||
/* Goal: Bundle features together that has little or no "overlap", i.e.
|
||||
only a few data points should have nonzero values for
|
||||
member features.
|
||||
Note that one-hot encoded features will be grouped together. */
|
||||
|
||||
std::vector<std::vector<unsigned>> groups;
|
||||
std::vector<std::vector<bool>> conflict_marks;
|
||||
std::vector<size_t> group_nnz;
|
||||
std::vector<size_t> group_conflict_cnt;
|
||||
const auto max_conflict_cnt
|
||||
= static_cast<size_t>(param.max_conflict_rate * nrow);
|
||||
|
||||
for (auto fid : feature_list) {
|
||||
const size_t cur_fid_nnz = feature_nnz[fid];
|
||||
|
||||
// randomly choose some of existing groups as candidates
|
||||
std::vector<size_t> search_groups;
|
||||
for (size_t gid = 0; gid < groups.size(); ++gid) {
|
||||
if (group_nnz[gid] + cur_fid_nnz <= nrow + max_conflict_cnt) {
|
||||
search_groups.push_back(gid);
|
||||
}
|
||||
}
|
||||
std::shuffle(search_groups.begin(), search_groups.end(), common::GlobalRandom());
|
||||
if (param.max_search_group > 0 && search_groups.size() > param.max_search_group) {
|
||||
search_groups.resize(param.max_search_group);
|
||||
}
|
||||
|
||||
BinTypeSize bins_type_size = colmat.GetTypeSize();
|
||||
if (bins_type_size == kUint8BinsTypeSize) {
|
||||
const auto column = colmat.GetColumn<uint8_t>(fid);
|
||||
SetGroup(fid, *(column.get()), max_conflict_cnt, search_groups,
|
||||
&group_conflict_cnt, &conflict_marks, &groups, &group_nnz, cur_fid_nnz, nrow);
|
||||
} else if (bins_type_size == kUint16BinsTypeSize) {
|
||||
const auto column = colmat.GetColumn<uint16_t>(fid);
|
||||
SetGroup(fid, *(column.get()), max_conflict_cnt, search_groups,
|
||||
&group_conflict_cnt, &conflict_marks, &groups, &group_nnz, cur_fid_nnz, nrow);
|
||||
} else {
|
||||
CHECK_EQ(bins_type_size, kUint32BinsTypeSize);
|
||||
const auto column = colmat.GetColumn<uint32_t>(fid);
|
||||
SetGroup(fid, *(column.get()), max_conflict_cnt, search_groups,
|
||||
&group_conflict_cnt, &conflict_marks, &groups, &group_nnz, cur_fid_nnz, nrow);
|
||||
}
|
||||
}
|
||||
return groups;
|
||||
}
|
||||
|
||||
inline std::vector<std::vector<unsigned>>
|
||||
FastFeatureGrouping(const GHistIndexMatrix& gmat,
|
||||
const ColumnMatrix& colmat,
|
||||
const tree::TrainParam& param) {
|
||||
const size_t nrow = gmat.row_ptr.size() - 1;
|
||||
const size_t nfeature = gmat.cut.Ptrs().size() - 1;
|
||||
|
||||
std::vector<unsigned> feature_list(nfeature);
|
||||
std::iota(feature_list.begin(), feature_list.end(), 0);
|
||||
|
||||
// sort features by nonzero counts, descending order
|
||||
std::vector<size_t> feature_nnz(nfeature);
|
||||
std::vector<unsigned> features_by_nnz(feature_list);
|
||||
gmat.GetFeatureCounts(&feature_nnz[0]);
|
||||
std::sort(features_by_nnz.begin(), features_by_nnz.end(),
|
||||
[&feature_nnz](unsigned a, unsigned b) {
|
||||
return feature_nnz[a] > feature_nnz[b];
|
||||
});
|
||||
|
||||
auto groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
|
||||
auto groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
|
||||
auto& groups = (groups_alt1.size() > groups_alt2.size()) ? groups_alt2 : groups_alt1;
|
||||
|
||||
// take apart small, sparse groups, as it won't help speed
|
||||
{
|
||||
std::vector<std::vector<unsigned>> ret;
|
||||
for (const auto& group : groups) {
|
||||
if (group.size() <= 1 || group.size() >= 5) {
|
||||
ret.push_back(group); // keep singleton groups and large (5+) groups
|
||||
} else {
|
||||
size_t nnz = 0;
|
||||
for (auto fid : group) {
|
||||
nnz += feature_nnz[fid];
|
||||
}
|
||||
double nnz_rate = static_cast<double>(nnz) / nrow;
|
||||
// take apart small sparse group, due it will not gain on speed
|
||||
if (nnz_rate <= param.sparse_threshold) {
|
||||
for (auto fid : group) {
|
||||
ret.emplace_back();
|
||||
ret.back().push_back(fid);
|
||||
}
|
||||
} else {
|
||||
ret.push_back(group);
|
||||
}
|
||||
}
|
||||
}
|
||||
groups = std::move(ret);
|
||||
}
|
||||
|
||||
// shuffle groups
|
||||
std::shuffle(groups.begin(), groups.end(), common::GlobalRandom());
|
||||
|
||||
return groups;
|
||||
}
|
||||
|
||||
void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
|
||||
const ColumnMatrix& colmat,
|
||||
const tree::TrainParam& param) {
|
||||
cut_ = &gmat.cut;
|
||||
|
||||
const size_t nrow = gmat.row_ptr.size() - 1;
|
||||
const uint32_t nbins = gmat.cut.Ptrs().back();
|
||||
|
||||
/* step 1: form feature groups */
|
||||
auto groups = FastFeatureGrouping(gmat, colmat, param);
|
||||
const auto nblock = static_cast<uint32_t>(groups.size());
|
||||
|
||||
/* step 2: build a new CSR matrix for each feature group */
|
||||
std::vector<uint32_t> bin2block(nbins); // lookup table [bin id] => [block id]
|
||||
for (uint32_t group_id = 0; group_id < nblock; ++group_id) {
|
||||
for (auto& fid : groups[group_id]) {
|
||||
const uint32_t bin_begin = gmat.cut.Ptrs()[fid];
|
||||
const uint32_t bin_end = gmat.cut.Ptrs()[fid + 1];
|
||||
for (uint32_t bin_id = bin_begin; bin_id < bin_end; ++bin_id) {
|
||||
bin2block[bin_id] = group_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::vector<uint32_t>> index_temp(nblock);
|
||||
std::vector<std::vector<size_t>> row_ptr_temp(nblock);
|
||||
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
|
||||
row_ptr_temp[block_id].push_back(0);
|
||||
}
|
||||
for (size_t rid = 0; rid < nrow; ++rid) {
|
||||
const size_t ibegin = gmat.row_ptr[rid];
|
||||
const size_t iend = gmat.row_ptr[rid + 1];
|
||||
for (size_t j = ibegin; j < iend; ++j) {
|
||||
const uint32_t bin_id = gmat.index[j];
|
||||
const uint32_t block_id = bin2block[bin_id];
|
||||
index_temp[block_id].push_back(bin_id);
|
||||
}
|
||||
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
|
||||
row_ptr_temp[block_id].push_back(index_temp[block_id].size());
|
||||
}
|
||||
}
|
||||
|
||||
/* step 3: concatenate CSR matrices into one (index, row_ptr) pair */
|
||||
std::vector<size_t> index_blk_ptr;
|
||||
std::vector<size_t> row_ptr_blk_ptr;
|
||||
index_blk_ptr.push_back(0);
|
||||
row_ptr_blk_ptr.push_back(0);
|
||||
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
|
||||
index_.insert(index_.end(), index_temp[block_id].begin(), index_temp[block_id].end());
|
||||
row_ptr_.insert(row_ptr_.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
|
||||
index_blk_ptr.push_back(index_.size());
|
||||
row_ptr_blk_ptr.push_back(row_ptr_.size());
|
||||
}
|
||||
|
||||
// save shortcut for each block
|
||||
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
|
||||
Block blk;
|
||||
blk.index_begin = &index_[index_blk_ptr[block_id]];
|
||||
blk.row_ptr_begin = &row_ptr_[row_ptr_blk_ptr[block_id]];
|
||||
blk.index_end = &index_[index_blk_ptr[block_id + 1]];
|
||||
blk.row_ptr_end = &row_ptr_[row_ptr_blk_ptr[block_id + 1]];
|
||||
blocks_.push_back(blk);
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief fill a histogram by zeros in range [begin, end)
|
||||
*/
|
||||
@@ -703,71 +442,6 @@ void GHistBuilder<double>::BuildHist(const std::vector<GradientPair>& gpair,
|
||||
GHistRow<double> hist,
|
||||
bool isDense);
|
||||
|
||||
template<typename GradientSumT>
|
||||
void GHistBuilder<GradientSumT>::BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
GHistRowT hist) {
|
||||
static constexpr int kUnroll = 8; // loop unrolling factor
|
||||
const size_t nblock = gmatb.GetNumBlock();
|
||||
const size_t nrows = row_indices.end - row_indices.begin;
|
||||
const size_t rest = nrows % kUnroll;
|
||||
#if defined(_OPENMP)
|
||||
const auto nthread = static_cast<bst_omp_uint>(this->nthread_); // NOLINT
|
||||
#endif // defined(_OPENMP)
|
||||
xgboost::detail::GradientPairInternal<GradientSumT>* p_hist = hist.data();
|
||||
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel for num_threads(nthread) schedule(guided)
|
||||
for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
|
||||
exc.Run([&]() {
|
||||
auto gmat = gmatb[bid];
|
||||
|
||||
for (size_t i = 0; i < nrows - rest; i += kUnroll) {
|
||||
size_t rid[kUnroll];
|
||||
size_t ibegin[kUnroll];
|
||||
size_t iend[kUnroll];
|
||||
GradientPair stat[kUnroll];
|
||||
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
rid[k] = row_indices.begin[i + k];
|
||||
ibegin[k] = gmat.row_ptr[rid[k]];
|
||||
iend[k] = gmat.row_ptr[rid[k] + 1];
|
||||
stat[k] = gpair[rid[k]];
|
||||
}
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
|
||||
const uint32_t bin = gmat.index[j];
|
||||
p_hist[bin].Add(stat[k].GetGrad(), stat[k].GetHess());
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t i = nrows - rest; i < nrows; ++i) {
|
||||
const size_t rid = row_indices.begin[i];
|
||||
const size_t ibegin = gmat.row_ptr[rid];
|
||||
const size_t iend = gmat.row_ptr[rid + 1];
|
||||
const GradientPair stat = gpair[rid];
|
||||
for (size_t j = ibegin; j < iend; ++j) {
|
||||
const uint32_t bin = gmat.index[j];
|
||||
p_hist[bin].Add(stat.GetGrad(), stat.GetHess());
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
exc.Rethrow();
|
||||
}
|
||||
template
|
||||
void GHistBuilder<float>::BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
GHistRow<float> hist);
|
||||
template
|
||||
void GHistBuilder<double>::BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
GHistRow<double> hist);
|
||||
|
||||
|
||||
template<typename GradientSumT>
|
||||
void GHistBuilder<GradientSumT>::SubtractionTrick(GHistRowT self,
|
||||
GHistRowT sibling,
|
||||
|
||||
@@ -321,48 +321,8 @@ int32_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(bst_uint begin, bst_uint end,
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct GHistIndexBlock {
|
||||
const size_t* row_ptr;
|
||||
const uint32_t* index;
|
||||
|
||||
inline GHistIndexBlock(const size_t* row_ptr, const uint32_t* index)
|
||||
: row_ptr(row_ptr), index(index) {}
|
||||
|
||||
// get i-th row
|
||||
inline GHistIndexRow operator[](size_t i) const {
|
||||
return {&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]};
|
||||
}
|
||||
};
|
||||
|
||||
class ColumnMatrix;
|
||||
|
||||
class GHistIndexBlockMatrix {
|
||||
public:
|
||||
void Init(const GHistIndexMatrix& gmat,
|
||||
const ColumnMatrix& colmat,
|
||||
const tree::TrainParam& param);
|
||||
|
||||
inline GHistIndexBlock operator[](size_t i) const {
|
||||
return {blocks_[i].row_ptr_begin, blocks_[i].index_begin};
|
||||
}
|
||||
|
||||
inline size_t GetNumBlock() const {
|
||||
return blocks_.size();
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<size_t> row_ptr_;
|
||||
std::vector<uint32_t> index_;
|
||||
const HistogramCuts* cut_;
|
||||
struct Block {
|
||||
const size_t* row_ptr_begin;
|
||||
const size_t* row_ptr_end;
|
||||
const uint32_t* index_begin;
|
||||
const uint32_t* index_end;
|
||||
};
|
||||
std::vector<Block> blocks_;
|
||||
};
|
||||
|
||||
template<typename GradientSumT>
|
||||
using GHistRow = Span<xgboost::detail::GradientPairInternal<GradientSumT> >;
|
||||
|
||||
@@ -672,11 +632,6 @@ class GHistBuilder {
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRowT hist,
|
||||
bool isDense);
|
||||
// same, with feature grouping
|
||||
void BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
GHistRowT hist);
|
||||
// construct a histogram via subtraction trick
|
||||
void SubtractionTrick(GHistRowT self,
|
||||
GHistRowT sibling,
|
||||
|
||||
Reference in New Issue
Block a user