Reduce device synchronisation (#5631)

* Reduce device synchronisation

* Initialise pinned memory
Author: Rory Mitchell
Date: 2020-05-07 21:19:46 +12:00
Committed by: GitHub
Parent: 9910265064
Commit: fcf57823b6
7 changed files with 260 additions and 118 deletions


@@ -0,0 +1,120 @@
/*!
 * Copyright 2020 by XGBoost Contributors
 */
#ifndef DRIVER_CUH_
#define DRIVER_CUH_
#include <xgboost/span.h>

#include <functional>
#include <ostream>
#include <queue>
#include <vector>

#include "../param.h"
#include "evaluate_splits.cuh"

namespace xgboost {
namespace tree {

struct ExpandEntry {
  int nid;
  int depth;
  DeviceSplitCandidate split;
  ExpandEntry() = default;
  XGBOOST_DEVICE ExpandEntry(int nid, int depth, DeviceSplitCandidate split)
      : nid(nid), depth(depth), split(std::move(split)) {}

  bool IsValid(const TrainParam& param, int num_leaves) const {
    if (split.loss_chg <= kRtEps) return false;
    if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
      return false;
    }
    if (split.loss_chg < param.min_split_loss) {
      return false;
    }
    if (param.max_depth > 0 && depth == param.max_depth) {
      return false;
    }
    if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
      return false;
    }
    return true;
  }

  static bool ChildIsValid(const TrainParam& param, int depth, int num_leaves) {
    if (param.max_depth > 0 && depth >= param.max_depth) return false;
    if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
    return true;
  }

  friend std::ostream& operator<<(std::ostream& os, const ExpandEntry& e) {
    os << "ExpandEntry: \n";
    os << "nidx: " << e.nid << "\n";
    os << "depth: " << e.depth << "\n";
    os << "loss: " << e.split.loss_chg << "\n";
    os << "left_sum: " << e.split.left_sum << "\n";
    os << "right_sum: " << e.split.right_sum << "\n";
    return os;
  }
};

inline bool DepthWise(const ExpandEntry& lhs, const ExpandEntry& rhs) {
  return lhs.depth > rhs.depth;  // favor small depth
}
inline bool LossGuide(const ExpandEntry& lhs, const ExpandEntry& rhs) {
  if (lhs.split.loss_chg == rhs.split.loss_chg) {
    return lhs.nid > rhs.nid;  // favor small timestamp (nid)
  } else {
    return lhs.split.loss_chg < rhs.split.loss_chg;  // favor large loss_chg
  }
}
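
// Note: std::priority_queue keeps the element that compares greatest under
// its comparator on top. With DepthWise, an entry of larger depth compares
// as "smaller", so the shallowest node is popped first; with LossGuide, the
// node with the largest loss_chg is popped first, ties broken towards the
// smaller (older) nid.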
// Drives execution of tree building on device
class Driver {
  using ExpandQueue =
      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
                          std::function<bool(ExpandEntry, ExpandEntry)>>;

 public:
  explicit Driver(TrainParam::TreeGrowPolicy policy)
      : policy_(policy),
        queue_(policy == TrainParam::kDepthWise ? DepthWise : LossGuide) {}

  template <typename EntryIterT>
  void Push(EntryIterT begin, EntryIterT end) {
    for (auto it = begin; it != end; ++it) {
      const ExpandEntry& e = *it;
      if (e.split.loss_chg > kRtEps) {
        queue_.push(e);
      }
    }
  }
  void Push(const std::vector<ExpandEntry>& entries) {
    this->Push(entries.begin(), entries.end());
  }

  // Return the set of nodes to be expanded.
  // This set has no dependencies between entries, so they may be expanded in
  // parallel or asynchronously.
  std::vector<ExpandEntry> Pop() {
    if (queue_.empty()) return {};
    // Return a single entry for loss-guided mode
    if (policy_ == TrainParam::kLossGuide) {
      ExpandEntry e = queue_.top();
      queue_.pop();
      return {e};
    }
    // Return all nodes on the same level for depth-wise mode
    std::vector<ExpandEntry> result;
    ExpandEntry e = queue_.top();
    int level = e.depth;
    while (e.depth == level && !queue_.empty()) {
      queue_.pop();
      result.emplace_back(e);
      if (!queue_.empty()) {
        e = queue_.top();
      }
    }
    return result;
  }

 private:
  TrainParam::TreeGrowPolicy policy_;
  ExpandQueue queue_;
};

}  // namespace tree
}  // namespace xgboost

#endif  // DRIVER_CUH_
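
For orientation, a sketch of how a caller might drive this queue; ApplySplit
and EvaluateChildSplits below are hypothetical placeholders standing in for the
updater's real split-apply and split-evaluation steps, which are not part of
this diff. Because the entries returned by Pop() carry no dependencies on one
another, the per-node work can be queued back to back without synchronising
the device between nodes:

Driver driver(param.grow_policy);
driver.Push({root_entry});  // seed with an already-evaluated root split
int num_leaves = 1;
for (auto expand_set = driver.Pop(); !expand_set.empty();
     expand_set = driver.Pop()) {
  std::vector<ExpandEntry> children;
  for (const auto& e : expand_set) {
    if (!e.IsValid(param, num_leaves)) continue;
    ApplySplit(e);  // hypothetical: materialise the split in the tree
    ++num_leaves;   // splitting one leaf yields a net gain of one leaf
    if (ExpandEntry::ChildIsValid(param, e.depth + 1, num_leaves)) {
      auto pair = EvaluateChildSplits(e);  // hypothetical: scores both children
      children.push_back(pair.first);
      children.push_back(pair.second);
    }
  }
  driver.Push(children);  // entries with loss_chg <= kRtEps are filtered out
}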


@@ -61,6 +61,7 @@ class RowPartitioner {
   dh::caching_device_vector<int64_t>
       left_counts_;  // Useful to keep a bunch of zeroed memory for sort position
   std::vector<cudaStream_t> streams_;
+  dh::PinnedMemory pinned_;
  public:
   RowPartitioner(int device_idx, size_t num_rows);
@@ -129,12 +130,12 @@ class RowPartitioner {
       d_position[idx] = new_position;
     });
     // Overlap device-to-host memory copy (left_count) with the sort
-    int64_t left_count;
+    int64_t& left_count = pinned_.GetSpan<int64_t>(1)[0];
     dh::safe_cuda(cudaMemcpyAsync(&left_count, d_left_count, sizeof(int64_t),
                                   cudaMemcpyDeviceToHost, streams_[0]));
     SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count,
                         streams_[1]);
     dh::safe_cuda(cudaStreamSynchronize(streams_[0]));
     CHECK_LE(left_count, segment.Size());
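
The pinned_ member is what makes the left_count change above pay off:
cudaMemcpyAsync into ordinary pageable host memory degrades to an effectively
synchronous staged copy, while a page-locked destination lets the
device-to-host copy genuinely overlap the SortPositionAndCopy work queued on
streams_[1], and the final cudaStreamSynchronize then waits on a single stream
rather than the whole device. A minimal standalone sketch of that pattern in
plain CUDA (not XGBoost code; XGBoost's dh::PinnedMemory helper wraps the same
idea):

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main() {
  int64_t* d_count = nullptr;
  cudaMalloc(&d_count, sizeof(int64_t));
  cudaMemset(d_count, 0, sizeof(int64_t));

  int64_t* h_count = nullptr;
  cudaMallocHost(&h_count, sizeof(int64_t));  // pinned (page-locked) host memory

  cudaStream_t copy_stream, work_stream;
  cudaStreamCreate(&copy_stream);
  cudaStreamCreate(&work_stream);

  // The copy runs asynchronously on copy_stream because h_count is pinned,
  // so independent kernels launched on work_stream can overlap it.
  cudaMemcpyAsync(h_count, d_count, sizeof(int64_t),
                  cudaMemcpyDeviceToHost, copy_stream);
  // ... launch independent work on work_stream here ...

  cudaStreamSynchronize(copy_stream);  // wait for this copy only, not the device
  printf("count = %lld\n", static_cast<long long>(*h_count));

  cudaFreeHost(h_count);
  cudaFree(d_count);
  cudaStreamDestroy(copy_stream);
  cudaStreamDestroy(work_stream);
  return 0;
}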