Fixed the performance regression within EvaluateSplits(). (#3680)
- It turns out that creating a std::vector on every call is faster than a cudaMallocHost()/cudaFreeHost() pair on every call.
This commit is contained in:
parent
beab6e08dd
commit
f606cb8ef4
@ -747,6 +747,7 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
struct ExpandEntry;
|
struct ExpandEntry;
|
||||||
|
|
||||||
GPUHistMaker() : initialised_(false), p_last_fmat_(nullptr) {}
|
GPUHistMaker() : initialised_(false), p_last_fmat_(nullptr) {}
|
||||||
|
|
||||||
void Init(
|
void Init(
|
||||||
const std::vector<std::pair<std::string, std::string>>& args) override {
|
const std::vector<std::pair<std::string, std::string>>& args) override {
|
||||||
param_.InitAllowUnknown(args);
|
param_.InitAllowUnknown(args);
|
||||||
@ -919,9 +920,7 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
const std::vector<int>& nidx_set, RegTree* p_tree) {
|
const std::vector<int>& nidx_set, RegTree* p_tree) {
|
||||||
auto columns = info_->num_col_;
|
auto columns = info_->num_col_;
|
||||||
std::vector<DeviceSplitCandidate> best_splits(nidx_set.size());
|
std::vector<DeviceSplitCandidate> best_splits(nidx_set.size());
|
||||||
DeviceSplitCandidate* candidate_splits;
|
std::vector<DeviceSplitCandidate> candidate_splits(nidx_set.size() * columns);
|
||||||
dh::safe_cuda(cudaMallocHost(&candidate_splits, nidx_set.size() *
|
|
||||||
columns * sizeof(DeviceSplitCandidate)));
|
|
||||||
// Use first device
|
// Use first device
|
||||||
auto& shard = shards_.front();
|
auto& shard = shards_.front();
|
||||||
dh::safe_cuda(cudaSetDevice(shard->device_idx));
|
dh::safe_cuda(cudaSetDevice(shard->device_idx));
|
||||||
@ -952,10 +951,10 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dh::safe_cuda(cudaDeviceSynchronize());
|
dh::safe_cuda(cudaDeviceSynchronize());
|
||||||
dh::safe_cuda(
|
dh::safe_cuda
|
||||||
cudaMemcpy(candidate_splits, shard->temp_memory.d_temp_storage,
|
(cudaMemcpy(candidate_splits.data(), shard->temp_memory.d_temp_storage,
|
||||||
sizeof(DeviceSplitCandidate) * columns * nidx_set.size(),
|
sizeof(DeviceSplitCandidate) * columns * nidx_set.size(),
|
||||||
cudaMemcpyDeviceToHost));
|
cudaMemcpyDeviceToHost));
|
||||||
for (auto i = 0; i < nidx_set.size(); i++) {
|
for (auto i = 0; i < nidx_set.size(); i++) {
|
||||||
auto depth = p_tree->GetDepth(nidx_set[i]);
|
auto depth = p_tree->GetDepth(nidx_set[i]);
|
||||||
DeviceSplitCandidate nidx_best;
|
DeviceSplitCandidate nidx_best;
|
||||||
@ -965,7 +964,6 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
}
|
}
|
||||||
best_splits[i] = nidx_best;
|
best_splits[i] = nidx_best;
|
||||||
}
|
}
|
||||||
dh::safe_cuda(cudaFreeHost(candidate_splits));
|
|
||||||
return std::move(best_splits);
|
return std::move(best_splits);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user