Support column split in GPU evaluate splits (#9511)
This commit is contained in:
@@ -5,8 +5,8 @@
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
|
||||
#include "../../collective/communicator-inl.cuh"
|
||||
#include "../../common/categorical.h"
|
||||
#include "../../common/device_helpers.cuh"
|
||||
#include "../../data/ellpack_page.cuh"
|
||||
#include "evaluate_splits.cuh"
|
||||
#include "expand_entry.cuh"
|
||||
@@ -409,6 +409,23 @@ void GPUHistEvaluator::EvaluateSplits(
|
||||
this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
|
||||
evaluator, out_splits);
|
||||
|
||||
if (is_column_split_) {
|
||||
// With column-wise data split, we gather the split candidates from all the workers and find the
|
||||
// global best candidates.
|
||||
auto const world_size = collective::GetWorldSize();
|
||||
dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
|
||||
auto all_candidates = dh::ToSpan(all_candidate_storage);
|
||||
collective::AllGather(device_, out_splits.data(), all_candidates.data(),
|
||||
out_splits.size() * sizeof(DeviceSplitCandidate));
|
||||
|
||||
// Reduce to get the best candidate from all workers.
|
||||
dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
|
||||
for (auto rank = 0; rank < world_size; rank++) {
|
||||
out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
|
||||
auto d_entries = out_entries;
|
||||
auto device_cats_accessor = this->DeviceCatStorage(nidx);
|
||||
|
||||
@@ -83,6 +83,9 @@ class GPUHistEvaluator {
|
||||
// Number of elements of categorical storage type
|
||||
// needed to hold categoricals for a single mode
|
||||
std::size_t node_categorical_storage_size_ = 0;
|
||||
// Is the data split column-wise?
|
||||
bool is_column_split_ = false;
|
||||
int32_t device_;
|
||||
|
||||
// Copy the categories from device to host asynchronously.
|
||||
void CopyToHost( const std::vector<bst_node_t>& nidx);
|
||||
@@ -136,7 +139,8 @@ class GPUHistEvaluator {
|
||||
* \brief Reset the evaluator, should be called before any use.
|
||||
*/
|
||||
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
|
||||
bst_feature_t n_features, TrainParam const &param, int32_t device);
|
||||
bst_feature_t n_features, TrainParam const &param, bool is_column_split,
|
||||
int32_t device);
|
||||
|
||||
/**
|
||||
* \brief Get host category storage for nidx. Different from the internal version, this
|
||||
|
||||
@@ -14,10 +14,9 @@
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
|
||||
common::Span<FeatureType const> ft,
|
||||
bst_feature_t n_features, TrainParam const &param,
|
||||
int32_t device) {
|
||||
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
|
||||
bst_feature_t n_features, TrainParam const &param,
|
||||
bool is_column_split, int32_t device) {
|
||||
param_ = param;
|
||||
tree_evaluator_ = TreeEvaluator{param, n_features, device};
|
||||
has_categoricals_ = cuts.HasCategorical();
|
||||
@@ -65,6 +64,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
|
||||
return fidx;
|
||||
});
|
||||
}
|
||||
is_column_split_ = is_column_split;
|
||||
device_ = device;
|
||||
}
|
||||
|
||||
common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
|
||||
|
||||
@@ -242,7 +242,8 @@ struct GPUHistMakerDevice {
|
||||
page = sample.page;
|
||||
gpair = sample.gpair;
|
||||
|
||||
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id);
|
||||
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
|
||||
dmat->Info().IsColumnSplit(), ctx_->gpu_id);
|
||||
|
||||
quantiser.reset(new GradientQuantiser(this->gpair));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user