Thread local memory allocation for BuildHist (#6358)

* thread mem locality

* fix apply

* cleanup

* fix lint

* fix tests

* simple try

* fix

* fix

* apply comments

* fix comments

* fix

* apply simple comment

Co-authored-by: ShvetsKS <kirill.shvets@intel.com>
This commit is contained in:
ShvetsKS
2020-11-25 17:50:12 +03:00
committed by GitHub
parent 4dbbeb635d
commit 956beead70
6 changed files with 89 additions and 47 deletions

View File

@@ -407,9 +407,14 @@ class HistCollection {
// access histogram for i-th node
GHistRowT operator[](bst_uint nid) const {
constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
CHECK_NE(row_ptr_[nid], kMax);
GradientPairT* ptr =
const_cast<GradientPairT*>(dmlc::BeginPtr(data_) + row_ptr_[nid]);
const size_t id = row_ptr_[nid];
CHECK_NE(id, kMax);
GradientPairT* ptr = nullptr;
if (contiguous_allocation_) {
ptr = const_cast<GradientPairT*>(data_[0].data() + nbins_*id);
} else {
ptr = const_cast<GradientPairT*>(data_[id].data());
}
return {ptr, nbins_};
}
@@ -438,21 +443,37 @@ class HistCollection {
}
CHECK_EQ(row_ptr_[nid], kMax);
if (data_.size() < nbins_ * (nid + 1)) {
data_.resize(nbins_ * (nid + 1));
if (data_.size() < (nid + 1)) {
data_.resize((nid + 1));
}
row_ptr_[nid] = nbins_ * n_nodes_added_;
row_ptr_[nid] = n_nodes_added_;
n_nodes_added_++;
}
// Allocate (thread local) memory for the histogram of the i-th node on first use.
// The slot is located through row_ptr_[nid]; if it already holds storage the call
// is a no-op, otherwise it is sized to nbins_ entries, each initialised with {0, 0}.
void AllocateData(bst_uint nid) {
  auto& node_hist = data_[row_ptr_[nid]];
  if (node_hist.empty()) {
    node_hist.resize(nbins_, {0, 0});
  }
}
// Allocate one common contiguous buffer for all nodes; needed for a single Allreduce call.
// The buffer lives in data_[0] and holds nbins_ entries for every slot of data_.
// The collection is switched to contiguous mode, in which operator[] resolves a
// node histogram as an offset into data_[0].
void AllocateAllData() {
  contiguous_allocation_ = true;
  // With no slots there is nothing to allocate, and indexing data_[0] would be
  // out of bounds (undefined behaviour for std::vector::operator[]).
  if (data_.empty()) {
    return;
  }
  const size_t new_size = nbins_ * data_.size();
  if (data_[0].size() != new_size) {
    data_[0].resize(new_size);
  }
}
private:
/*! \brief number of all bins over all features */
uint32_t nbins_ = 0;
/*! \brief number of active nodes in hist collection */
uint32_t n_nodes_added_ = 0;
/*! \brief flag to identify contiguous memory allocation */
bool contiguous_allocation_ = false;
std::vector<GradientPairT> data_;
std::vector<std::vector<GradientPairT>> data_;
/*! \brief row_ptr_[nid] locates bin for histogram of node nid */
std::vector<size_t> row_ptr_;
@@ -481,7 +502,6 @@ class ParallelGHistBuilder {
const std::vector<GHistRowT>& targeted_hists) {
hist_buffer_.Init(nbins_);
tid_nid_to_hist_.clear();
hist_memory_.clear();
threads_to_nids_map_.clear();
targeted_hists_ = targeted_hists;
@@ -504,8 +524,11 @@ class ParallelGHistBuilder {
CHECK_LT(nid, nodes_);
CHECK_LT(tid, nthreads_);
size_t idx = tid_nid_to_hist_.at({tid, nid});
GHistRowT hist = hist_memory_[idx];
int idx = tid_nid_to_hist_.at({tid, nid});
if (idx >= 0) {
hist_buffer_.AllocateData(idx);
}
GHistRowT hist = idx == -1 ? targeted_hists_[nid] : hist_buffer_[idx];
if (!hist_was_used_[tid * nodes_ + nid]) {
InitilizeHistByZeroes(hist, 0, hist.size());
@@ -526,8 +549,9 @@ class ParallelGHistBuilder {
for (size_t tid = 0; tid < nthreads_; ++tid) {
if (hist_was_used_[tid * nodes_ + nid]) {
is_updated = true;
const size_t idx = tid_nid_to_hist_.at({tid, nid});
GHistRowT src = hist_memory_[idx];
int idx = tid_nid_to_hist_.at({tid, nid});
GHistRowT src = idx == -1 ? targeted_hists_[nid] : hist_buffer_[idx];
if (dst.data() != src.data()) {
IncrementHist(dst, src, begin, end);
@@ -589,7 +613,6 @@ class ParallelGHistBuilder {
}
void MatchNodeNidPairToHist() {
size_t hist_total = 0;
size_t hist_allocated_additionally = 0;
for (size_t nid = 0; nid < nodes_; ++nid) {
@@ -597,15 +620,11 @@ class ParallelGHistBuilder {
for (size_t tid = 0; tid < nthreads_; ++tid) {
if (threads_to_nids_map_[tid * nodes_ + nid]) {
if (first_hist) {
hist_memory_.push_back(targeted_hists_[nid]);
tid_nid_to_hist_[{tid, nid}] = -1;
first_hist = false;
} else {
hist_memory_.push_back(hist_buffer_[hist_allocated_additionally]);
hist_allocated_additionally++;
tid_nid_to_hist_[{tid, nid}] = hist_allocated_additionally++;
}
// map pair {tid, nid} to index of allocated histogram from hist_memory_
tid_nid_to_hist_[{tid, nid}] = hist_total++;
CHECK_EQ(hist_total, hist_memory_.size());
}
}
}
@@ -630,10 +649,11 @@ class ParallelGHistBuilder {
std::vector<bool> threads_to_nids_map_;
/*! \brief Contains histograms for final results */
std::vector<GHistRowT> targeted_hists_;
/*! \brief Allocated memory for histograms used for construction */
std::vector<GHistRowT> hist_memory_;
/*! \brief map pair {tid, nid} to index of allocated histogram from hist_memory_ */
std::map<std::pair<size_t, size_t>, size_t> tid_nid_to_hist_;
/*!
* \brief map pair {tid, nid} to index of allocated histogram from hist_buffer_ and targeted_hists_,
* -1 is reserved for targeted_hists_
*/
std::map<std::pair<size_t, size_t>, int> tid_nid_to_hist_;
};
/*!