Refactor parts of fast histogram utilities (#3564)
* Refactor parts of fast histogram utilities * Removed byte packing from column matrix
This commit is contained in:
@@ -8,47 +8,14 @@
|
||||
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
#define XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
|
||||
#define XGBOOST_TYPE_SWITCH(dtype, OP) \
|
||||
\
|
||||
switch(dtype) { \
|
||||
case xgboost::common::uint32: { \
|
||||
using DType = uint32_t; \
|
||||
OP; \
|
||||
break; \
|
||||
} \
|
||||
case xgboost::common::uint16: { \
|
||||
using DType = uint16_t; \
|
||||
OP; \
|
||||
break; \
|
||||
} \
|
||||
case xgboost::common::uint8: { \
|
||||
using DType = uint8_t; \
|
||||
OP; \
|
||||
break; \
|
||||
default: \
|
||||
LOG(FATAL) << "don't recognize type flag" << dtype; \
|
||||
} \
|
||||
\
|
||||
}
|
||||
|
||||
#include <type_traits>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
#include "hist_util.h"
|
||||
#include "../tree/fast_hist_param.h"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
using tree::FastHistParam;
|
||||
|
||||
/*! \brief indicator of data type used for storing bin id's in a column. */
|
||||
enum DataType {
|
||||
uint8 = 1,
|
||||
uint16 = 2,
|
||||
uint32 = 4
|
||||
};
|
||||
|
||||
/*! \brief column type */
|
||||
enum ColumnType {
|
||||
@@ -58,14 +25,36 @@ enum ColumnType {
|
||||
|
||||
/*! \brief a column storage, to be used with ApplySplit. Note that each
|
||||
bin id is stored as index[i] + index_base. */
|
||||
template<typename T>
|
||||
class Column {
|
||||
public:
|
||||
ColumnType type;
|
||||
const T* index;
|
||||
uint32_t index_base;
|
||||
const size_t* row_ind;
|
||||
size_t len;
|
||||
Column(ColumnType type, const uint32_t* index, uint32_t index_base,
|
||||
const size_t* row_ind, size_t len)
|
||||
: type_(type),
|
||||
index_(index),
|
||||
index_base_(index_base),
|
||||
row_ind_(row_ind),
|
||||
len_(len) {}
|
||||
size_t Size() const { return len_; }
|
||||
uint32_t GetGlobalBinIdx(size_t idx) const { return index_base_ + index_[idx]; }
|
||||
uint32_t GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
|
||||
// column.GetFeatureBinIdx(idx) + column.GetBaseIdx() ==
|
||||
// column.GetGlobalBinIdx(idx)
|
||||
uint32_t GetBaseIdx() const { return index_base_; }
|
||||
ColumnType GetType() const { return type_; }
|
||||
size_t GetRowIdx(size_t idx) const {
|
||||
return type_ == ColumnType::kDenseColumn ? idx : row_ind_[idx];
|
||||
}
|
||||
bool IsMissing(size_t idx) const {
|
||||
return index_[idx] == std::numeric_limits<uint32_t>::max();
|
||||
}
|
||||
const size_t* GetRowData() const { return row_ind_; }
|
||||
|
||||
private:
|
||||
ColumnType type_;
|
||||
const uint32_t* index_;
|
||||
uint32_t index_base_;
|
||||
const size_t* row_ind_;
|
||||
const size_t len_;
|
||||
};
|
||||
|
||||
/*! \brief a collection of columns, with support for construction from
|
||||
@@ -79,13 +68,8 @@ class ColumnMatrix {
|
||||
|
||||
// construct column matrix from GHistIndexMatrix
|
||||
inline void Init(const GHistIndexMatrix& gmat,
|
||||
const FastHistParam& param) {
|
||||
this->dtype = static_cast<DataType>(param.colmat_dtype);
|
||||
/* if dtype is smaller than uint32_t, multiple bin_id's will be stored in each
|
||||
slot of internal buffer. */
|
||||
packing_factor_ = sizeof(uint32_t) / static_cast<size_t>(this->dtype);
|
||||
|
||||
const auto nfeature = static_cast<bst_uint>(gmat.cut->row_ptr.size() - 1);
|
||||
double sparse_threshold) {
|
||||
const auto nfeature = static_cast<bst_uint>(gmat.cut.row_ptr.size() - 1);
|
||||
const size_t nrow = gmat.row_ptr.size() - 1;
|
||||
|
||||
// identify type of each column
|
||||
@@ -93,19 +77,16 @@ class ColumnMatrix {
|
||||
type_.resize(nfeature);
|
||||
std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
|
||||
|
||||
uint32_t max_val = 0;
|
||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
||||
max_val = static_cast<uint32_t>(std::numeric_limits<DType>::max());
|
||||
});
|
||||
uint32_t max_val = std::numeric_limits<uint32_t>::max();
|
||||
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
||||
CHECK_LE(gmat.cut->row_ptr[fid + 1] - gmat.cut->row_ptr[fid], max_val);
|
||||
CHECK_LE(gmat.cut.row_ptr[fid + 1] - gmat.cut.row_ptr[fid], max_val);
|
||||
}
|
||||
|
||||
gmat.GetFeatureCounts(&feature_counts_[0]);
|
||||
// classify features
|
||||
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
||||
if (static_cast<double>(feature_counts_[fid])
|
||||
< param.sparse_threshold * nrow) {
|
||||
< sparse_threshold * nrow) {
|
||||
type_[fid] = kSparseColumn;
|
||||
} else {
|
||||
type_[fid] = kDenseColumn;
|
||||
@@ -131,28 +112,23 @@ class ColumnMatrix {
|
||||
boundary_[fid].row_ind_end = accum_row_ind_;
|
||||
}
|
||||
|
||||
index_.resize((boundary_[nfeature - 1].index_end
|
||||
+ (packing_factor_ - 1)) / packing_factor_);
|
||||
index_.resize(boundary_[nfeature - 1].index_end);
|
||||
row_ind_.resize(boundary_[nfeature - 1].row_ind_end);
|
||||
|
||||
// store least bin id for each feature
|
||||
index_base_.resize(nfeature);
|
||||
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
||||
index_base_[fid] = gmat.cut->row_ptr[fid];
|
||||
index_base_[fid] = gmat.cut.row_ptr[fid];
|
||||
}
|
||||
|
||||
// pre-fill index_ for dense columns
|
||||
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
||||
if (type_[fid] == kDenseColumn) {
|
||||
const size_t ibegin = boundary_[fid].index_begin;
|
||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
||||
const size_t block_offset = ibegin / packing_factor_;
|
||||
const size_t elem_offset = ibegin % packing_factor_;
|
||||
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
||||
DType* end = begin + nrow;
|
||||
std::fill(begin, end, std::numeric_limits<DType>::max());
|
||||
// max() indicates missing values
|
||||
});
|
||||
uint32_t* begin = &index_[ibegin];
|
||||
uint32_t* end = begin + nrow;
|
||||
std::fill(begin, end, std::numeric_limits<uint32_t>::max());
|
||||
// max() indicates missing values
|
||||
}
|
||||
}
|
||||
|
||||
@@ -167,23 +143,15 @@ class ColumnMatrix {
|
||||
size_t fid = 0;
|
||||
for (size_t i = ibegin; i < iend; ++i) {
|
||||
const uint32_t bin_id = gmat.index[i];
|
||||
while (bin_id >= gmat.cut->row_ptr[fid + 1]) {
|
||||
while (bin_id >= gmat.cut.row_ptr[fid + 1]) {
|
||||
++fid;
|
||||
}
|
||||
if (type_[fid] == kDenseColumn) {
|
||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
||||
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
||||
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
||||
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
||||
begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
|
||||
});
|
||||
uint32_t* begin = &index_[boundary_[fid].index_begin];
|
||||
begin[rid] = bin_id - index_base_[fid];
|
||||
} else {
|
||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
||||
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
||||
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
||||
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
||||
begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
|
||||
});
|
||||
uint32_t* begin = &index_[boundary_[fid].index_begin];
|
||||
begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
|
||||
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
|
||||
++num_nonzeros[fid];
|
||||
}
|
||||
@@ -193,29 +161,13 @@ class ColumnMatrix {
|
||||
|
||||
/* Fetch an individual column. This code should be used with XGBOOST_TYPE_SWITCH
|
||||
to determine type of bin id's */
|
||||
template<typename T>
|
||||
inline Column<T> GetColumn(unsigned fid) const {
|
||||
const bool valid_type = std::is_same<T, uint32_t>::value
|
||||
|| std::is_same<T, uint16_t>::value
|
||||
|| std::is_same<T, uint8_t>::value;
|
||||
CHECK(valid_type);
|
||||
|
||||
Column<T> c;
|
||||
|
||||
c.type = type_[fid];
|
||||
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
||||
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
||||
c.index = reinterpret_cast<const T*>(&index_[block_offset]) + elem_offset;
|
||||
c.index_base = index_base_[fid];
|
||||
c.row_ind = &row_ind_[boundary_[fid].row_ind_begin];
|
||||
c.len = boundary_[fid].index_end - boundary_[fid].index_begin;
|
||||
|
||||
inline Column GetColumn(unsigned fid) const {
|
||||
Column c(type_[fid], &index_[boundary_[fid].index_begin], index_base_[fid],
|
||||
&row_ind_[boundary_[fid].row_ind_begin],
|
||||
boundary_[fid].index_end - boundary_[fid].index_begin);
|
||||
return c;
|
||||
}
|
||||
|
||||
public:
|
||||
DataType dtype;
|
||||
|
||||
private:
|
||||
struct ColumnBoundary {
|
||||
// indicate where each column's index and row_ind is stored.
|
||||
@@ -233,8 +185,6 @@ class ColumnMatrix {
|
||||
std::vector<size_t> row_ind_;
|
||||
std::vector<ColumnBoundary> boundary_;
|
||||
|
||||
size_t packing_factor_; // how many integers are stored in each slot of index_
|
||||
|
||||
// index_base_[fid]: least bin id for feature fid
|
||||
std::vector<uint32_t> index_base_;
|
||||
};
|
||||
|
||||
@@ -114,12 +114,23 @@ void HistCutMatrix::Init
|
||||
}
|
||||
}
|
||||
|
||||
void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
CHECK(cut != nullptr); // NOLINT
|
||||
uint32_t HistCutMatrix::GetBinIdx(const Entry& e) {
|
||||
unsigned fid = e.index;
|
||||
auto cbegin = cut.begin() + row_ptr[fid];
|
||||
auto cend = cut.begin() + row_ptr[fid + 1];
|
||||
CHECK(cbegin != cend);
|
||||
auto it = std::upper_bound(cbegin, cend, e.fvalue);
|
||||
if (it == cend) it = cend - 1;
|
||||
uint32_t idx = static_cast<uint32_t>(it - cut.begin());
|
||||
return idx;
|
||||
}
|
||||
|
||||
void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
|
||||
cut.Init(p_fmat, max_num_bins);
|
||||
auto iter = p_fmat->RowIterator();
|
||||
|
||||
const int nthread = omp_get_max_threads();
|
||||
const uint32_t nbins = cut->row_ptr.back();
|
||||
const uint32_t nbins = cut.row_ptr.back();
|
||||
hit_count.resize(nbins, 0);
|
||||
hit_count_tloc_.resize(nthread * nbins, 0);
|
||||
|
||||
@@ -133,8 +144,8 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
}
|
||||
index.resize(row_ptr.back());
|
||||
|
||||
CHECK_GT(cut->cut.size(), 0U);
|
||||
CHECK_EQ(cut->row_ptr.back(), cut->cut.size());
|
||||
CHECK_GT(cut.cut.size(), 0U);
|
||||
CHECK_EQ(cut.row_ptr.back(), cut.cut.size());
|
||||
|
||||
auto bsize = static_cast<omp_ulong>(batch.Size());
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
@@ -145,13 +156,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
SparsePage::Inst inst = batch[i];
|
||||
CHECK_EQ(ibegin + inst.length, iend);
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
unsigned fid = inst[j].index;
|
||||
auto cbegin = cut->cut.begin() + cut->row_ptr[fid];
|
||||
auto cend = cut->cut.begin() + cut->row_ptr[fid + 1];
|
||||
CHECK(cbegin != cend);
|
||||
auto it = std::upper_bound(cbegin, cend, inst[j].fvalue);
|
||||
if (it == cend) it = cend - 1;
|
||||
uint32_t idx = static_cast<uint32_t>(it - cut->cut.begin());
|
||||
uint32_t idx = cut.GetBinIdx(inst[j]);
|
||||
index[ibegin + j] = idx;
|
||||
++hit_count_tloc_[tid * nbins + idx];
|
||||
}
|
||||
@@ -167,14 +172,13 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static size_t GetConflictCount(const std::vector<bool>& mark,
|
||||
const Column<T>& column,
|
||||
const Column& column,
|
||||
size_t max_cnt) {
|
||||
size_t ret = 0;
|
||||
if (column.type == xgboost::common::kDenseColumn) {
|
||||
for (size_t i = 0; i < column.len; ++i) {
|
||||
if (column.index[i] != std::numeric_limits<T>::max() && mark[i]) {
|
||||
if (column.GetType() == xgboost::common::kDenseColumn) {
|
||||
for (size_t i = 0; i < column.Size(); ++i) {
|
||||
if (column.GetFeatureBinIdx(i) != std::numeric_limits<uint32_t>::max() && mark[i]) {
|
||||
++ret;
|
||||
if (ret > max_cnt) {
|
||||
return max_cnt + 1;
|
||||
@@ -182,8 +186,8 @@ static size_t GetConflictCount(const std::vector<bool>& mark,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < column.len; ++i) {
|
||||
if (mark[column.row_ind[i]]) {
|
||||
for (size_t i = 0; i < column.Size(); ++i) {
|
||||
if (mark[column.GetRowIdx(i)]) {
|
||||
++ret;
|
||||
if (ret > max_cnt) {
|
||||
return max_cnt + 1;
|
||||
@@ -194,30 +198,28 @@ static size_t GetConflictCount(const std::vector<bool>& mark,
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void
|
||||
MarkUsed(std::vector<bool>* p_mark, const Column<T>& column) {
|
||||
MarkUsed(std::vector<bool>* p_mark, const Column& column) {
|
||||
std::vector<bool>& mark = *p_mark;
|
||||
if (column.type == xgboost::common::kDenseColumn) {
|
||||
for (size_t i = 0; i < column.len; ++i) {
|
||||
if (column.index[i] != std::numeric_limits<T>::max()) {
|
||||
if (column.GetType() == xgboost::common::kDenseColumn) {
|
||||
for (size_t i = 0; i < column.Size(); ++i) {
|
||||
if (column.GetFeatureBinIdx(i) != std::numeric_limits<uint32_t>::max()) {
|
||||
mark[i] = true;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < column.len; ++i) {
|
||||
mark[column.row_ind[i]] = true;
|
||||
for (size_t i = 0; i < column.Size(); ++i) {
|
||||
mark[column.GetRowIdx(i)] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline std::vector<std::vector<unsigned>>
|
||||
FindGroups_(const std::vector<unsigned>& feature_list,
|
||||
const std::vector<size_t>& feature_nnz,
|
||||
const ColumnMatrix& colmat,
|
||||
size_t nrow,
|
||||
const FastHistParam& param) {
|
||||
FindGroups(const std::vector<unsigned>& feature_list,
|
||||
const std::vector<size_t>& feature_nnz,
|
||||
const ColumnMatrix& colmat,
|
||||
size_t nrow,
|
||||
const FastHistParam& param) {
|
||||
/* Goal: Bundle features together that have little or no "overlap", i.e.
|
||||
only a few data points should have nonzero values for
|
||||
member features.
|
||||
@@ -231,7 +233,7 @@ FindGroups_(const std::vector<unsigned>& feature_list,
|
||||
= static_cast<size_t>(param.max_conflict_rate * nrow);
|
||||
|
||||
for (auto fid : feature_list) {
|
||||
const Column<T>& column = colmat.GetColumn<T>(fid);
|
||||
const Column& column = colmat.GetColumn(fid);
|
||||
|
||||
const size_t cur_fid_nnz = feature_nnz[fid];
|
||||
bool need_new_group = true;
|
||||
@@ -276,24 +278,12 @@ FindGroups_(const std::vector<unsigned>& feature_list,
|
||||
return groups;
|
||||
}
|
||||
|
||||
inline std::vector<std::vector<unsigned>>
|
||||
FindGroups(const std::vector<unsigned>& feature_list,
|
||||
const std::vector<size_t>& feature_nnz,
|
||||
const ColumnMatrix& colmat,
|
||||
size_t nrow,
|
||||
const FastHistParam& param) {
|
||||
XGBOOST_TYPE_SWITCH(colmat.dtype, {
|
||||
return FindGroups_<DType>(feature_list, feature_nnz, colmat, nrow, param);
|
||||
});
|
||||
return std::vector<std::vector<unsigned>>(); // to avoid warning message
|
||||
}
|
||||
|
||||
inline std::vector<std::vector<unsigned>>
|
||||
FastFeatureGrouping(const GHistIndexMatrix& gmat,
|
||||
const ColumnMatrix& colmat,
|
||||
const FastHistParam& param) {
|
||||
const size_t nrow = gmat.row_ptr.size() - 1;
|
||||
const size_t nfeature = gmat.cut->row_ptr.size() - 1;
|
||||
const size_t nfeature = gmat.cut.row_ptr.size() - 1;
|
||||
|
||||
std::vector<unsigned> feature_list(nfeature);
|
||||
std::iota(feature_list.begin(), feature_list.end(), 0);
|
||||
@@ -346,10 +336,10 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
|
||||
void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
|
||||
const ColumnMatrix& colmat,
|
||||
const FastHistParam& param) {
|
||||
cut_ = gmat.cut;
|
||||
cut_ = &gmat.cut;
|
||||
|
||||
const size_t nrow = gmat.row_ptr.size() - 1;
|
||||
const uint32_t nbins = gmat.cut->row_ptr.back();
|
||||
const uint32_t nbins = gmat.cut.row_ptr.back();
|
||||
|
||||
/* step 1: form feature groups */
|
||||
auto groups = FastFeatureGrouping(gmat, colmat, param);
|
||||
@@ -359,8 +349,8 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
|
||||
std::vector<uint32_t> bin2block(nbins); // lookup table [bin id] => [block id]
|
||||
for (uint32_t group_id = 0; group_id < nblock; ++group_id) {
|
||||
for (auto& fid : groups[group_id]) {
|
||||
const uint32_t bin_begin = gmat.cut->row_ptr[fid];
|
||||
const uint32_t bin_end = gmat.cut->row_ptr[fid + 1];
|
||||
const uint32_t bin_begin = gmat.cut.row_ptr[fid];
|
||||
const uint32_t bin_end = gmat.cut.row_ptr[fid + 1];
|
||||
for (uint32_t bin_id = bin_begin; bin_id < bin_end; ++bin_id) {
|
||||
bin2block[bin_id] = group_id;
|
||||
}
|
||||
|
||||
@@ -75,6 +75,7 @@ struct HistCutMatrix {
|
||||
std::vector<bst_float> min_val;
|
||||
/*! \brief the cut field */
|
||||
std::vector<bst_float> cut;
|
||||
uint32_t GetBinIdx(const Entry &e);
|
||||
/*! \brief Get histogram bound for fid */
|
||||
inline HistCutUnit operator[](bst_uint fid) const {
|
||||
return {dmlc::BeginPtr(cut) + row_ptr[fid],
|
||||
@@ -122,18 +123,18 @@ struct GHistIndexMatrix {
|
||||
/*! \brief hit count of each index */
|
||||
std::vector<size_t> hit_count;
|
||||
/*! \brief The corresponding cuts */
|
||||
const HistCutMatrix* cut;
|
||||
HistCutMatrix cut;
|
||||
// Create a global histogram matrix, given cut
|
||||
void Init(DMatrix* p_fmat);
|
||||
void Init(DMatrix* p_fmat, int max_num_bins);
|
||||
// get i-th row
|
||||
inline GHistIndexRow operator[](size_t i) const {
|
||||
return {&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]};
|
||||
}
|
||||
inline void GetFeatureCounts(size_t* counts) const {
|
||||
auto nfeature = cut->row_ptr.size() - 1;
|
||||
auto nfeature = cut.row_ptr.size() - 1;
|
||||
for (unsigned fid = 0; fid < nfeature; ++fid) {
|
||||
auto ibegin = cut->row_ptr[fid];
|
||||
auto iend = cut->row_ptr[fid + 1];
|
||||
auto ibegin = cut.row_ptr[fid];
|
||||
auto iend = cut.row_ptr[fid + 1];
|
||||
for (auto i = ibegin; i < iend; ++i) {
|
||||
counts[fid] += hit_count[i];
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user