Batch UpdatePosition using cudaMemcpy (#7964)
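This change batches the per-node UpdatePosition calls in the GPU hist updater. Instead of launching one partitioning pass per newly expanded node, the candidates for a tree level are collected on the host and handed to a single RowPartitioner::UpdatePositionBatch call, so the per-node split data (NodeSplitData) can be moved to the device in one copy rather than one transfer per node. FinalisePosition now also caches each row's leaf value in a device vector (update_predictions), which turns UpdatePredictionCache into a single element-wise add and lets it report through a bool return whether the cache is usable.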
@@ -182,10 +182,11 @@ struct GPUHistMakerDevice {
   std::unique_ptr<RowPartitioner> row_partitioner;
   DeviceHistogramStorage<GradientSumT> hist{};
 
-  dh::caching_device_vector<GradientPair> d_gpair;  // storage for gpair;
+  dh::device_vector<GradientPair> d_gpair;  // storage for gpair;
   common::Span<GradientPair> gpair;
 
-  dh::caching_device_vector<int> monotone_constraints;
+  dh::device_vector<int> monotone_constraints;
+  dh::device_vector<float> update_predictions;
 
   /*! \brief Sum gradient for each node. */
   std::vector<GradientPairPrecise> node_sum_gradients;
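The persistent gradient and constraint buffers move from dh::caching_device_vector to plain dh::device_vector; these allocations live for the whole training run, so (presumably) routing them through the pooled caching allocator buys little here. The new update_predictions buffer holds one cached leaf value per training row: it is filled in FinalisePositionInPage below and consumed by UpdatePredictionCache.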
@@ -356,36 +357,49 @@ struct GPUHistMakerDevice {
     return true;
   }
 
-  void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) {
-    RegTree::Node split_node = (*p_tree)[e.nid];
-    auto split_type = p_tree->NodeSplitType(e.nid);
-    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
-    auto node_cats = e.split.split_cats.Bits();
+  // Extra data for each node that is passed
+  // to the update position function
+  struct NodeSplitData {
+    RegTree::Node split_node;
+    FeatureType split_type;
+    common::CatBitField node_cats;
+  };
 
-    row_partitioner->UpdatePosition(
-        e.nid, split_node.LeftChild(), split_node.RightChild(),
-        [=] __device__(bst_uint ridx) {
+  void UpdatePosition(const std::vector<GPUExpandEntry>& candidates, RegTree* p_tree) {
+    if (candidates.empty()) return;
+    std::vector<int> nidx(candidates.size());
+    std::vector<int> left_nidx(candidates.size());
+    std::vector<int> right_nidx(candidates.size());
+    std::vector<NodeSplitData> split_data(candidates.size());
+    for (int i = 0; i < candidates.size(); i++) {
+      auto& e = candidates[i];
+      RegTree::Node split_node = (*p_tree)[e.nid];
+      auto split_type = p_tree->NodeSplitType(e.nid);
+      nidx.at(i) = e.nid;
+      left_nidx.at(i) = split_node.LeftChild();
+      right_nidx.at(i) = split_node.RightChild();
+      split_data.at(i) = NodeSplitData{split_node, split_type, e.split.split_cats};
+    }
+
+    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
+    row_partitioner->UpdatePositionBatch(
+        nidx, left_nidx, right_nidx, split_data,
+        [=] __device__(bst_uint ridx, const NodeSplitData& data) {
           // given a row index, returns the node id it belongs to
-          bst_float cut_value =
-              d_matrix.GetFvalue(ridx, split_node.SplitIndex());
+          bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
           // Missing value
-          bst_node_t new_position = 0;
+          bool go_left = true;
          if (isnan(cut_value)) {
-            new_position = split_node.DefaultChild();
+            go_left = data.split_node.DefaultLeft();
          } else {
-            bool go_left = true;
-            if (split_type == FeatureType::kCategorical) {
-              go_left = common::Decision<false>(node_cats, cut_value, split_node.DefaultLeft());
+            if (data.split_type == FeatureType::kCategorical) {
+              go_left = common::Decision<false>(data.node_cats.Bits(), cut_value,
+                                                data.split_node.DefaultLeft());
            } else {
-              go_left = cut_value <= split_node.SplitCond();
-            }
-            if (go_left) {
-              new_position = split_node.LeftChild();
-            } else {
-              new_position = split_node.RightChild();
+              go_left = cut_value <= data.split_node.SplitCond();
            }
          }
-          return new_position;
+          return go_left;
        });
   }
 
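The RowPartitioner::UpdatePositionBatch implementation is not part of this diff. The standalone sketch below illustrates the batching pattern the commit title refers to, under the assumption that the partitioner packs all per-node arguments into one contiguous array, ships them with a single cudaMemcpy, and launches one kernel covering every candidate node; all names in it (PerNodeArgs, PartitionBatch) are hypothetical, and error checking (dh::safe_cuda in the real code) is omitted.

    // Minimal sketch: one cudaMemcpy and one launch for all candidate nodes,
    // instead of one transfer + one launch per node.
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    // Hypothetical per-node payload; the real code passes NodeSplitData.
    struct PerNodeArgs {
      int nidx, left_nidx, right_nidx;
      int row_begin, row_end;  // this node's segment of the row index array
      float split_cond;
    };

    __global__ void PartitionBatch(const PerNodeArgs* args, int n_nodes,
                                   const float* fvalue, int* position) {
      int node = blockIdx.y;  // one grid row per candidate node
      if (node >= n_nodes) return;
      PerNodeArgs a = args[node];
      int ridx = a.row_begin + blockIdx.x * blockDim.x + threadIdx.x;
      if (ridx >= a.row_end) return;
      bool go_left = fvalue[ridx] <= a.split_cond;  // stand-in for the split op
      position[ridx] = go_left ? a.left_nidx : a.right_nidx;
    }

    int main() {
      const int n_rows = 8;
      std::vector<float> h_fvalue{0.1f, 0.9f, 0.4f, 0.6f, 0.2f, 0.8f, 0.3f, 0.7f};
      // Two candidate nodes, each owning half of the rows.
      std::vector<PerNodeArgs> h_args{{1, 3, 4, 0, 4, 0.5f}, {2, 5, 6, 4, 8, 0.5f}};

      float* d_fvalue; int* d_position; PerNodeArgs* d_args;
      cudaMalloc(&d_fvalue, n_rows * sizeof(float));
      cudaMalloc(&d_position, n_rows * sizeof(int));
      cudaMalloc(&d_args, h_args.size() * sizeof(PerNodeArgs));
      cudaMemcpy(d_fvalue, h_fvalue.data(), n_rows * sizeof(float), cudaMemcpyHostToDevice);
      // The batched transfer: a single copy carries every node's split data.
      cudaMemcpy(d_args, h_args.data(), h_args.size() * sizeof(PerNodeArgs),
                 cudaMemcpyHostToDevice);

      dim3 grid(1, static_cast<unsigned>(h_args.size()));
      PartitionBatch<<<grid, 256>>>(d_args, static_cast<int>(h_args.size()),
                                    d_fvalue, d_position);

      std::vector<int> h_position(n_rows);
      cudaMemcpy(h_position.data(), d_position, n_rows * sizeof(int), cudaMemcpyDeviceToHost);
      for (int i = 0; i < n_rows; ++i) printf("row %d -> node %d\n", i, h_position[i]);
      cudaFree(d_fvalue); cudaFree(d_position); cudaFree(d_args);
      return 0;
    }

Note also that the batched lambda now returns only a go-left decision for the given NodeSplitData, rather than computing the child node id itself; the partitioner can derive the destination from left_nidx/right_nidx.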
@@ -394,6 +408,16 @@ struct GPUHistMakerDevice {
   // prediction cache
   void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat, ObjInfo task,
                         HostDeviceVector<bst_node_t>* p_out_position) {
+    // Prediction cache will not be used with external memory
+    if (!p_fmat->SingleColBlock()) {
+      if (task.UpdateTreeLeaf()) {
+        LOG(FATAL) << "Current objective function can not be used with external memory.";
+      }
+      p_out_position->Resize(0);
+      update_predictions.clear();
+      return;
+    }
+
     dh::TemporaryArray<RegTree::Node> d_nodes(p_tree->GetNodes().size());
     dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
                                   d_nodes.size() * sizeof(RegTree::Node),
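With per-row leaf values now cached on the device, the prediction cache cannot be maintained when the data does not fit in a single device page. FinalisePosition therefore bails out early for external memory: objectives that adjust tree leaves are rejected outright, and both the output positions and the cached predictions are emptied so that UpdatePredictionCache below returns false and the caller falls back to a full prediction pass.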
@@ -412,25 +436,9 @@ struct GPUHistMakerDevice {
       dh::CopyToD(categories_segments, &d_categories_segments);
     }
 
-    if (row_partitioner->GetRows().size() != p_fmat->Info().num_row_) {
-      row_partitioner.reset();  // Release the device memory first before reallocating
-      row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, p_fmat->Info().num_row_));
-    }
-    if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) {
-      // see comment in the `FinalisePositionInPage`.
-      LOG(FATAL) << "Current objective function can not be used with subsampled external memory.";
-    }
-    if (page->n_rows == p_fmat->Info().num_row_) {
-      FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types),
-                             dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task,
-                             p_out_position);
-    } else {
-      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(batch_param)) {
-        FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes), dh::ToSpan(d_split_types),
-                               dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task,
-                               p_out_position);
-      }
-    }
+    FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types),
+                           dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments),
+                           p_out_position);
   }
 
   void FinalisePositionInPage(EllpackPageImpl const *page,
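Because the external-memory case now returns before any device work, the loop over EllpackPage batches and the row-count check that rebuilt the RowPartitioner become unreachable and are dropped: FinalisePositionInPage runs exactly once on the resident page, and its ObjInfo parameter goes away with the removed checks (see the next hunk).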
@@ -438,79 +446,73 @@ struct GPUHistMakerDevice {
                               common::Span<FeatureType const> d_feature_types,
                               common::Span<uint32_t const> categories,
                               common::Span<RegTree::Segment> categories_segments,
-                              ObjInfo task,
                               HostDeviceVector<bst_node_t>* p_out_position) {
     auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
     auto d_gpair = this->gpair;
-    row_partitioner->FinalisePosition(
-        ctx_, task, p_out_position,
-        [=] __device__(size_t row_id, int position) {
-          // What happens if user prune the tree?
-          if (!d_matrix.IsInRange(row_id)) {
-            return RowPartitioner::kIgnoredTreePosition;
-          }
-          auto node = d_nodes[position];
+    update_predictions.resize(row_partitioner->GetRows().size());
+    auto d_update_predictions = dh::ToSpan(update_predictions);
+    p_out_position->SetDevice(ctx_->gpu_id);
+    p_out_position->Resize(row_partitioner->GetRows().size());
 
-          while (!node.IsLeaf()) {
-            bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex());
-            // Missing value
-            if (isnan(element)) {
-              position = node.DefaultChild();
-            } else {
-              bool go_left = true;
-              if (common::IsCat(d_feature_types, position)) {
-                auto node_cats =
-                    categories.subspan(categories_segments[position].beg,
-                                       categories_segments[position].size);
-                go_left = common::Decision<false>(node_cats, element, node.DefaultLeft());
-              } else {
-                go_left = element <= node.SplitCond();
-              }
-              if (go_left) {
-                position = node.LeftChild();
-              } else {
-                position = node.RightChild();
-              }
-            }
-            node = d_nodes[position];
-          }
+    auto new_position_op = [=] __device__(size_t row_id, int position) {
+      // What happens if user prune the tree?
+      if (!d_matrix.IsInRange(row_id)) {
+        return RowPartitioner::kIgnoredTreePosition;
+      }
+      auto node = d_nodes[position];
 
-          return position;
-        },
-        [d_gpair] __device__(size_t ridx) {
-          // FIXME(jiamingy): Doesn't work when sampling is used with external memory as
-          // the sampler compacts the gradient vector.
-          return d_gpair[ridx].GetHess() - .0f == 0.f;
-        });
+      while (!node.IsLeaf()) {
+        bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex());
+        // Missing value
+        if (isnan(element)) {
+          position = node.DefaultChild();
+        } else {
+          bool go_left = true;
+          if (common::IsCat(d_feature_types, position)) {
+            auto node_cats = categories.subspan(categories_segments[position].beg,
+                                                categories_segments[position].size);
+            go_left = common::Decision<false>(node_cats, element, node.DefaultLeft());
+          } else {
+            go_left = element <= node.SplitCond();
+          }
+          if (go_left) {
+            position = node.LeftChild();
+          } else {
+            position = node.RightChild();
+          }
+        }
+        node = d_nodes[position];
+      }
+
+      d_update_predictions[row_id] = node.LeafValue();
+      return position;
+    };  // NOLINT
+
+    auto d_out_position = p_out_position->DeviceSpan();
+    row_partitioner->FinalisePosition(d_out_position, new_position_op);
+
+    dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
+      bst_node_t position = d_out_position[idx];
+      d_update_predictions[idx] = d_nodes[position].LeafValue();
+      bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f;
+      d_out_position[idx] = is_row_sampled ? ~position : position;
+    });
   }
 
-  void UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) {
+  bool UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) {
+    if (update_predictions.empty()) {
+      return false;
+    }
     CHECK(p_tree);
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
     CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
-    auto d_ridx = row_partitioner->GetRows();
-
-    GPUTrainingParam param_d(param);
-    dh::TemporaryArray<GradientPairPrecise> device_node_sum_gradients(node_sum_gradients.size());
-
-    dh::safe_cuda(cudaMemcpyAsync(device_node_sum_gradients.data().get(), node_sum_gradients.data(),
-                                  sizeof(GradientPairPrecise) * node_sum_gradients.size(),
-                                  cudaMemcpyHostToDevice));
-    auto d_position = row_partitioner->GetPosition();
-    auto d_node_sum_gradients = device_node_sum_gradients.data().get();
-    auto tree_evaluator = evaluator_.GetEvaluator();
-
-    auto const& h_nodes = p_tree->GetNodes();
-    dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
-    dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
-                                  h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice));
-    auto d_nodes = dh::ToSpan(nodes);
-    dh::LaunchN(d_ridx.size(), [=] XGBOOST_DEVICE(size_t idx) mutable {
-      bst_node_t nidx = d_position[idx];
-      auto weight = d_nodes[nidx].LeafValue();
-      out_preds_d(d_ridx[idx]) += weight;
+    auto d_update_predictions = dh::ToSpan(update_predictions);
+    CHECK_EQ(out_preds_d.Size(), d_update_predictions.size());
+    dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable {
+      out_preds_d(idx) += d_update_predictions[idx];
     });
     row_partitioner.reset();
+    return true;
   }
 
   // num histograms is the number of contiguous histograms in memory to reduce over
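Two details worth noting above. First, the tree-walk lambda writes each row's leaf value into update_predictions as a side effect, and the old FIXME about sampling with external memory disappears because external memory now skips position finalisation entirely. Second, the follow-up LaunchN flags rows removed by gradient-based sampling (hessian exactly zero) by storing the bitwise complement of their position: the complement makes the stored value negative while keeping the node id recoverable. A small self-contained sketch of that encoding, inferred from the ternary in the LaunchN body:

    // Sketch of the ~position encoding: sign marks a sampled-out row,
    // bitwise complement recovers the original leaf id.
    #include <cstdint>
    #include <cstdio>

    int main() {
      int32_t position = 5;          // leaf the row landed in
      int32_t encoded = ~position;   // sampled-out row: ~5 == -6, always negative
      bool sampled = encoded < 0;    // consumers test the sign...
      int32_t decoded = sampled ? ~encoded : encoded;  // ...and recover the leaf
      printf("encoded=%d sampled=%d decoded=%d\n", encoded, sampled, decoded);
      return 0;
    }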
@@ -684,14 +686,12 @@ struct GPUHistMakerDevice {
       auto new_candidates =
           pinned.GetSpan<GPUExpandEntry>(filtered_expand_set.size() * 2, GPUExpandEntry());
 
-      for (const auto& e : filtered_expand_set) {
-        monitor.Start("UpdatePosition");
-        // Update position is only run when child is valid, instead of right after apply
-        // split (as in approx tree method). Hence we have the finalise position call
-        // in GPU Hist.
-        this->UpdatePosition(e, p_tree);
-        monitor.Stop("UpdatePosition");
-      }
+      monitor.Start("UpdatePosition");
+      // Update position is only run when child is valid, instead of right after apply
+      // split (as in approx tree method). Hence we have the finalise position call
+      // in GPU Hist.
+      this->UpdatePosition(filtered_expand_set, p_tree);
+      monitor.Stop("UpdatePosition");
 
       monitor.Start("BuildHist");
       this->BuildHistLeftRight(filtered_expand_set, reducer, tree);
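At the call site the per-candidate loop collapses into one call: the whole filtered expand set for the level goes to the batched UpdatePosition, so the number of kernel launches and host-to-device copies per level no longer scales with the number of expanded nodes.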
@@ -844,9 +844,9 @@ class GPUHistMaker : public TreeUpdater {
       return false;
     }
     monitor_.Start("UpdatePredictionCache");
-    maker->UpdatePredictionCache(p_out_preds, p_last_tree_);
+    bool result = maker->UpdatePredictionCache(p_out_preds, p_last_tree_);
     monitor_.Stop("UpdatePredictionCache");
-    return true;
+    return result;
   }
 
   TrainParam param_;  // NOLINT
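Finally, GPUHistMaker::UpdatePredictionCache propagates the maker's result instead of unconditionally returning true, so the caller performs a full prediction pass whenever the cache was cleared (for example after the external-memory early-out above).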