Merge lossgude and depthwise strategies for CPU hist (#7007)

* fix java/scala test: max depth is also valid parameter for lossguide

Co-authored-by: Kirill Shvets <kirill.shvets@intel.com>
This commit is contained in:
ShvetsKS
2021-06-02 20:49:43 +03:00
committed by GitHub
parent ee4f51a631
commit 57c732655e
11 changed files with 415 additions and 484 deletions

View File

@@ -25,6 +25,7 @@
#include "../data/ellpack_page.cuh"
#include "param.h"
#include "driver.h"
#include "updater_gpu_common.cuh"
#include "split_evaluator.h"
#include "constraints.cuh"
@@ -33,7 +34,7 @@
#include "gpu_hist/row_partitioner.cuh"
#include "gpu_hist/histogram.cuh"
#include "gpu_hist/evaluate_splits.cuh"
#include "gpu_hist/driver.cuh"
#include "gpu_hist/expand_entry.cuh"
namespace xgboost {
namespace tree {
@@ -321,8 +322,8 @@ struct GPUHistMakerDevice {
}
void EvaluateLeftRightSplits(
ExpandEntry candidate, int left_nidx, int right_nidx, const RegTree& tree,
common::Span<ExpandEntry> pinned_candidates_out) {
GPUExpandEntry candidate, int left_nidx, int right_nidx, const RegTree& tree,
common::Span<GPUExpandEntry> pinned_candidates_out) {
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2);
GPUTrainingParam gpu_param(param);
auto left_sampled_features =
@@ -363,7 +364,7 @@ struct GPUHistMakerDevice {
hist.GetNodeHistogram(right_nidx)};
auto d_splits_out = dh::ToSpan(splits_out);
EvaluateSplits(d_splits_out, tree_evaluator.GetEvaluator<GPUTrainingParam>(), left, right);
dh::TemporaryArray<ExpandEntry> entries(2);
dh::TemporaryArray<GPUExpandEntry> entries(2);
auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
auto d_entries = entries.data().get();
dh::LaunchN(device_id, 2, [=] __device__(size_t idx) {
@@ -378,12 +379,12 @@ struct GPUHistMakerDevice {
nidx, gpu_param, GradStats{split.right_sum});
d_entries[idx] =
ExpandEntry{nidx, candidate.depth + 1, d_splits_out[idx],
GPUExpandEntry{nidx, candidate.depth + 1, d_splits_out[idx],
base_weight, left_weight, right_weight};
});
dh::safe_cuda(cudaMemcpyAsync(
pinned_candidates_out.data(), entries.data().get(),
sizeof(ExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
}
void BuildHist(int nidx) {
@@ -569,7 +570,7 @@ struct GPUHistMakerDevice {
/**
* \brief Build GPU local histograms for the left and right child of some parent node
*/
void BuildHistLeftRight(const ExpandEntry &candidate, int nidx_left,
void BuildHistLeftRight(const GPUExpandEntry &candidate, int nidx_left,
int nidx_right, dh::AllReducer* reducer) {
auto build_hist_nidx = nidx_left;
auto subtraction_trick_nidx = nidx_right;
@@ -599,7 +600,7 @@ struct GPUHistMakerDevice {
}
}
void ApplySplit(const ExpandEntry& candidate, RegTree* p_tree) {
void ApplySplit(const GPUExpandEntry& candidate, RegTree* p_tree) {
RegTree& tree = *p_tree;
auto evaluator = tree_evaluator.GetEvaluator();
auto parent_sum = candidate.split.left_sum + candidate.split.right_sum;
@@ -647,7 +648,7 @@ struct GPUHistMakerDevice {
tree[candidate.nid].RightChild());
}
ExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
constexpr bst_node_t kRootNIdx = 0;
dh::XGBCachingDeviceAllocator<char> alloc;
GradientPair root_sum = dh::Reduce(
@@ -670,7 +671,7 @@ struct GPUHistMakerDevice {
// Generate first split
auto split = this->EvaluateRootSplit(root_sum);
dh::TemporaryArray<ExpandEntry> entries(1);
dh::TemporaryArray<GPUExpandEntry> entries(1);
auto d_entries = entries.data().get();
auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
GPUTrainingParam gpu_param(param);
@@ -681,20 +682,20 @@ struct GPUHistMakerDevice {
float right_weight = evaluator.CalcWeight(
kRootNIdx, gpu_param, GradStats{split.right_sum});
d_entries[0] =
ExpandEntry(kRootNIdx, depth, split,
GPUExpandEntry(kRootNIdx, depth, split,
weight, left_weight, right_weight);
});
ExpandEntry root_entry;
GPUExpandEntry root_entry;
dh::safe_cuda(cudaMemcpyAsync(
&root_entry, entries.data().get(),
sizeof(ExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
return root_entry;
}
void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat,
RegTree* p_tree, dh::AllReducer* reducer) {
auto& tree = *p_tree;
Driver driver(static_cast<TrainParam::TreeGrowPolicy>(param.grow_policy));
Driver<GPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param.grow_policy));
monitor.Start("Reset");
this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_);
@@ -710,7 +711,7 @@ struct GPUHistMakerDevice {
auto expand_set = driver.Pop();
while (!expand_set.empty()) {
auto new_candidates =
pinned.GetSpan<ExpandEntry>(expand_set.size() * 2, ExpandEntry());
pinned.GetSpan<GPUExpandEntry>(expand_set.size() * 2, GPUExpandEntry());
for (auto i = 0ull; i < expand_set.size(); i++) {
auto candidate = expand_set.at(i);
@@ -724,7 +725,7 @@ struct GPUHistMakerDevice {
int left_child_nidx = tree[candidate.nid].LeftChild();
int right_child_nidx = tree[candidate.nid].RightChild();
// Only create child entries if needed
if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
if (GPUExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
num_leaves)) {
monitor.Start("UpdatePosition");
this->UpdatePosition(candidate.nid, p_tree);
@@ -741,8 +742,8 @@ struct GPUHistMakerDevice {
monitor.Stop("EvaluateSplits");
} else {
// Set default
new_candidates[i * 2] = ExpandEntry();
new_candidates[i * 2 + 1] = ExpandEntry();
new_candidates[i * 2] = GPUExpandEntry();
new_candidates[i * 2 + 1] = GPUExpandEntry();
}
}
dh::safe_cuda(cudaDeviceSynchronize());