/**
 * Copyright 2017-2023 by XGBoost contributors
 */
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <xgboost/tree_updater.h>

#include <algorithm>
#include <cmath>
#include <cstddef>  // for size_t
#include <memory>   // for unique_ptr, make_unique
#include <utility>  // for move
#include <vector>

#include "../collective/communicator-inl.cuh"
#include "../common/bitfield.h"
#include "../common/categorical.h"
#include "../common/cuda_context.cuh"  // CUDAContext
#include "../common/device_helpers.cuh"
#include "../common/hist_util.h"
#include "../common/io.h"
#include "../common/timer.h"
#include "../data/ellpack_page.cuh"
#include "../data/ellpack_page.h"
#include "constraints.cuh"
#include "driver.h"
#include "gpu_hist/evaluate_splits.cuh"
#include "gpu_hist/expand_entry.cuh"
#include "gpu_hist/feature_groups.cuh"
#include "gpu_hist/gradient_based_sampler.cuh"
#include "gpu_hist/histogram.cuh"
#include "gpu_hist/row_partitioner.cuh"
#include "hist/param.h"
#include "param.h"
#include "updater_gpu_common.cuh"
#include "xgboost/base.h"
#include "xgboost/context.h"
#include "xgboost/data.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/json.h"
#include "xgboost/parameter.h"
#include "xgboost/span.h"
#include "xgboost/task.h"  // for ObjInfo
#include "xgboost/tree_model.h"

namespace xgboost::tree {
#if !defined(GTEST_TEST)
DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
#endif  // !defined(GTEST_TEST)

/**
 * \struct  DeviceHistogramStorage
 *
 * \summary Data storage for node histograms on device. Automatically expands.
 *          The histogram entry type is fixed to GradientPairInt64.
 *
 * \tparam kStopGrowingSize  Do not grow the cached buffer beyond this size.
 *
 * \author  Rory
 * \date    28/07/2018
 */
template <size_t kStopGrowingSize = 1 << 28>
class DeviceHistogramStorage {
 private:
  using GradientSumT = GradientPairInt64;
  /*! \brief Map nidx to starting index of its histogram. */
  std::map<int, size_t> nidx_map_;
  // Large buffer of zeroed memory, caches histograms
  dh::device_vector<typename GradientSumT::ValueT> data_;
  // If we run out of storage, allocate one histogram at a time in overflow.
  // Not cached, overwritten when a new histogram is requested.
  dh::device_vector<typename GradientSumT::ValueT> overflow_;
  std::map<int, size_t> overflow_nidx_map_;
  int n_bins_;
  int device_id_;
  static constexpr size_t kNumItemsInGradientSum =
      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");

 public:
  // Start with about 16mb
  DeviceHistogramStorage() { data_.reserve(1 << 22); }
  void Init(int device_id, int n_bins) {
    this->n_bins_ = n_bins;
    this->device_id_ = device_id;
  }

  void Reset() {
    auto d_data = data_.data().get();
    dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
    nidx_map_.clear();
    overflow_nidx_map_.clear();
  }
  [[nodiscard]] bool HistogramExists(int nidx) const {
    return nidx_map_.find(nidx) != nidx_map_.cend() ||
           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
  }
  [[nodiscard]] int Bins() const { return n_bins_; }
  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }

  void AllocateHistograms(const std::vector<int>& new_nidxs) {
    for (int nidx : new_nidxs) {
      CHECK(!HistogramExists(nidx));
    }
    // Number of items currently used in data
    const size_t used_size = nidx_map_.size() * HistogramSize();
    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
    if (used_size >= kStopGrowingSize) {
      // Use overflow
      // Delete previous entries
      overflow_nidx_map_.clear();
      overflow_.resize(HistogramSize() * new_nidxs.size());
      // Zero memory
      auto d_data = overflow_.data().get();
      dh::LaunchN(overflow_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0; });
      // Append new histograms
      for (int nidx : new_nidxs) {
        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
      }
    } else {
      CHECK_GE(data_.size(), used_size);
      // Expand if necessary
      if (data_.size() < new_used_size) {
        data_.resize(std::max(data_.size() * 2, new_used_size));
      }
      // Append new histograms
      for (int nidx : new_nidxs) {
        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
      }
    }

    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
  }

  /**
   * \summary Return pointer to histogram memory for a given node.
   * \param nidx Tree node index.
   * \return hist pointer.
   */
  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
    CHECK(this->HistogramExists(nidx));

    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
      // Fetch from normal cache
      auto ptr = data_.data().get() + nidx_map_.at(nidx);
      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
    } else {
      // Fetch from overflow
      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
    }
  }
};
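
// Illustrative usage sketch for DeviceHistogramStorage (not referenced by the updater itself;
// the device ordinal, node indices, and bin count below are hypothetical).  Histograms for a
// batch of new nodes are allocated together so that they are laid out contiguously in the
// cached buffer, which AllReduceHist later relies on.
[[maybe_unused]] inline void ExampleDeviceHistogramStorageUsage() {
  DeviceHistogramStorage<> hist;
  hist.Init(/*device_id=*/0, /*n_bins=*/256);
  hist.Reset();                          // zero the cached buffer, forget all nodes
  hist.AllocateHistograms({1, 2});       // contiguous storage for two new nodes
  auto left = hist.GetNodeHistogram(1);  // common::Span<GradientPairInt64> of size n_bins
  (void)left;
}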

// Manage memory for a single GPU
struct GPUHistMakerDevice {
 private:
  GPUHistEvaluator evaluator_;
  Context const* ctx_;
  std::shared_ptr<common::ColumnSampler> column_sampler_;

 public:
  EllpackPageImpl const* page{nullptr};
  common::Span<FeatureType const> feature_types;

  std::unique_ptr<RowPartitioner> row_partitioner;
  DeviceHistogramStorage<> hist{};

  dh::device_vector<GradientPair> d_gpair;  // storage for gpair
  common::Span<GradientPair> gpair;

  dh::device_vector<int> monotone_constraints;
  // node idx for each sample
  dh::device_vector<bst_node_t> positions;

  TrainParam param;

  std::unique_ptr<GradientQuantiser> quantiser;

  dh::PinnedMemory pinned;
  dh::PinnedMemory pinned2;

  common::Monitor monitor;
  FeatureInteractionConstraintDevice interaction_constraints;

  std::unique_ptr<GradientBasedSampler> sampler;

  std::unique_ptr<FeatureGroups> feature_groups;

  GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
                     common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
                     TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
                     uint32_t n_features, BatchParam batch_param)
      : evaluator_{_param, n_features, ctx->gpu_id},
        ctx_(ctx),
        feature_types{_feature_types},
        param(std::move(_param)),
        column_sampler_(std::move(column_sampler)),
        interaction_constraints(param, n_features) {
    sampler = std::make_unique<GradientBasedSampler>(ctx, _n_rows, batch_param, param.subsample,
                                                     param.sampling_method, is_external_memory);
    if (!param.monotone_constraints.empty()) {
      // Copy assigning an empty vector causes an exception in MSVC debug builds
      monotone_constraints = param.monotone_constraints;
    }

    CHECK(column_sampler_);
    monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
  }

  ~GPUHistMakerDevice() = default;

  void InitFeatureGroupsOnce() {
    if (!feature_groups) {
      CHECK(page);
      feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
                                                       dh::MaxSharedMemoryOptin(ctx_->gpu_id),
                                                       sizeof(GradientPairPrecise));
    }
  }

  // Reset values for each update iteration
  void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
    auto const& info = dmat->Info();
    this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
                                param.colsample_bynode, param.colsample_bylevel,
                                param.colsample_bytree);
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));

    this->interaction_constraints.Reset();

    if (d_gpair.size() != dh_gpair->Size()) {
      d_gpair.resize(dh_gpair->Size());
    }
    dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
                                  dh_gpair->Size() * sizeof(GradientPair),
                                  cudaMemcpyDeviceToDevice));
    auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
    page = sample.page;
    gpair = sample.gpair;

    this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
                           dmat->Info().IsColumnSplit(), ctx_->gpu_id);

    quantiser = std::make_unique<GradientQuantiser>(this->gpair);

    row_partitioner.reset();  // Release the device memory first before reallocating
    row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);

    // Init histogram
    hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
    hist.Reset();

    this->InitFeatureGroupsOnce();
  }
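
  // A sketch of the fixed-point round trip set up in Reset() above (names as used elsewhere in
  // this file; the sample gradient value is hypothetical).  Histograms are accumulated as
  // GradientPairInt64 so that summation is deterministic and cheap to all-reduce, and the
  // quantiser converts back to floating point wherever real statistics are needed:
  //
  //   GradientQuantiser q{this->gpair};
  //   auto fixed = q.ToFixedPoint(GradientPair{0.5f, 1.0f});  // quantised GradientPairInt64
  //   auto sum   = fixed + fixed;                             // exact integer accumulation
  //   auto back  = q.ToFloatingPoint(sum);                    // approximately {1.0, 2.0}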

  GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) {
    int nidx = RegTree::kRoot;
    GPUTrainingParam gpu_param(param);
    auto sampled_features = column_sampler_->GetFeatureSet(0);
    sampled_features->SetDevice(ctx_->Device());
    common::Span<bst_feature_t> feature_set =
        interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
    auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
    EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)};
    EvaluateSplitSharedInputs shared_inputs{gpu_param,
                                            *quantiser,
                                            feature_types,
                                            matrix.feature_segments,
                                            matrix.gidx_fvalue_map,
                                            matrix.min_fvalue,
                                            matrix.is_dense && !collective::IsDistributed()};
    auto split = this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs);
    return split;
  }

  void EvaluateSplits(const std::vector<GPUExpandEntry>& candidates, const RegTree& tree,
                      common::Span<GPUExpandEntry> pinned_candidates_out) {
    if (candidates.empty()) return;
    dh::TemporaryArray<EvaluateSplitInputs> d_node_inputs(2 * candidates.size());
    dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size());
    std::vector<bst_node_t> nidx(2 * candidates.size());
    auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
    auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
    EvaluateSplitSharedInputs shared_inputs{
        GPUTrainingParam{param}, *quantiser, feature_types, matrix.feature_segments,
        matrix.gidx_fvalue_map, matrix.min_fvalue,
        // is_dense represents the local data
        matrix.is_dense && !collective::IsDistributed()};
    dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
    // Store the feature set ptrs so they don't go out of scope before the kernel is called
    std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
    for (std::size_t i = 0; i < candidates.size(); i++) {
      auto candidate = candidates.at(i);
      int left_nidx = tree[candidate.nid].LeftChild();
      int right_nidx = tree[candidate.nid].RightChild();
      nidx[i * 2] = left_nidx;
      nidx[i * 2 + 1] = right_nidx;

      auto left_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(left_nidx));
      left_sampled_features->SetDevice(ctx_->Device());
      feature_sets.emplace_back(left_sampled_features);
      common::Span<bst_feature_t> left_feature_set =
          interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
      auto right_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(right_nidx));
      right_sampled_features->SetDevice(ctx_->Device());
      feature_sets.emplace_back(right_sampled_features);
      common::Span<bst_feature_t> right_feature_set =
          interaction_constraints.Query(right_sampled_features->DeviceSpan(), right_nidx);
      h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum,
                              left_feature_set, hist.GetNodeHistogram(left_nidx)};
      h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum,
                                  right_feature_set, hist.GetNodeHistogram(right_nidx)};
    }
    bst_feature_t max_active_features = 0;
    for (auto input : h_node_inputs) {
      max_active_features =
          std::max(max_active_features, static_cast<bst_feature_t>(input.feature_set.size()));
    }
    dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(),
                                  h_node_inputs.size() * sizeof(EvaluateSplitInputs),
                                  cudaMemcpyDefault));

    this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
                                    shared_inputs, dh::ToSpan(entries));
    dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(),
                                  sizeof(GPUExpandEntry) * entries.size(),
                                  cudaMemcpyDeviceToHost));
    dh::DefaultStream().Sync();
  }

  void BuildHist(int nidx) {
    auto d_node_hist = hist.GetNodeHistogram(nidx);
    auto d_ridx = row_partitioner->GetRows(nidx);
    BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id),
                           feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx,
                           d_node_hist, *quantiser);
  }

  // Attempt to do the subtraction trick.
  // Returns true if it succeeded.
  bool SubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) {
    if (!hist.HistogramExists(nidx_histogram) || !hist.HistogramExists(nidx_parent)) {
      return false;
    }
    auto d_node_hist_parent = hist.GetNodeHistogram(nidx_parent);
    auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram);
    auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction);

    dh::LaunchN(page->Cuts().TotalBins(), [=] __device__(size_t idx) {
      d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx];
    });
    return true;
  }
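
  // The subtraction trick relies on the identity hist(parent) == hist(left) + hist(right): once
  // the smaller child has been built directly, its sibling is derived bin-by-bin as
  // parent - built_child, avoiding a second pass over the rows.  A minimal sketch of how the two
  // routines above combine (node indices are hypothetical):
  //
  //   hist.AllocateHistograms({left, right});
  //   this->BuildHist(left);                               // smaller child, built from the data
  //   if (!this->SubtractionTrick(parent, left, right)) {  // right = parent - left
  //     this->BuildHist(right);                            // fall back: build it directly
  //   }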

  // Extra data for each node that is passed to the update position function
  struct NodeSplitData {
    RegTree::Node split_node;
    FeatureType split_type;
    common::KCatBitField node_cats;
  };

  void UpdatePosition(std::vector<GPUExpandEntry> const& candidates, RegTree* p_tree) {
    if (candidates.empty()) {
      return;
    }

    std::vector<bst_node_t> nidx(candidates.size());
    std::vector<bst_node_t> left_nidx(candidates.size());
    std::vector<bst_node_t> right_nidx(candidates.size());
    std::vector<NodeSplitData> split_data(candidates.size());

    for (size_t i = 0; i < candidates.size(); i++) {
      auto const& e = candidates[i];
      RegTree::Node split_node = (*p_tree)[e.nid];
      auto split_type = p_tree->NodeSplitType(e.nid);
      nidx.at(i) = e.nid;
      left_nidx.at(i) = split_node.LeftChild();
      right_nidx.at(i) = split_node.RightChild();
      split_data.at(i) =
          NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)};

      CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
    }

    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
    row_partitioner->UpdatePositionBatch(
        nidx, left_nidx, right_nidx, split_data,
        [=] __device__(bst_uint ridx, const NodeSplitData& data) {
          // Given a row index, return whether it goes to the left child.
          float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
          // Missing value
          bool go_left = true;
          if (isnan(cut_value)) {
            go_left = data.split_node.DefaultLeft();
          } else {
            if (data.split_type == FeatureType::kCategorical) {
              go_left = common::Decision(data.node_cats.Bits(), cut_value);
            } else {
              go_left = cut_value <= data.split_node.SplitCond();
            }
          }
          return go_left;
        });
  }

  // After tree update is finished, update the position of all training
  // instances to their final leaf.
  // This information is used later to update the prediction cache.
  void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat, ObjInfo task,
                        HostDeviceVector<bst_node_t>* p_out_position) {
    // Prediction cache will not be used with external memory
    if (!p_fmat->SingleColBlock()) {
      if (task.UpdateTreeLeaf()) {
        LOG(FATAL) << "Current objective function can not be used with external memory.";
      }
      p_out_position->Resize(0);
      positions.clear();
      return;
    }

    dh::TemporaryArray<RegTree::Node> d_nodes(p_tree->GetNodes().size());
    dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
                                  d_nodes.size() * sizeof(RegTree::Node),
                                  cudaMemcpyHostToDevice));
    auto const& h_split_types = p_tree->GetSplitTypes();
    auto const& categories = p_tree->GetSplitCategories();
    auto const& categories_segments = p_tree->GetSplitCategoriesPtr();

    dh::caching_device_vector<FeatureType> d_split_types;
    dh::caching_device_vector<uint32_t> d_categories;
    dh::caching_device_vector<RegTree::CategoricalSplitMatrix::Segment> d_categories_segments;

    if (!categories.empty()) {
      dh::CopyToD(h_split_types, &d_split_types);
      dh::CopyToD(categories, &d_categories);
      dh::CopyToD(categories_segments, &d_categories_segments);
    }

    FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types),
                           dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments),
                           p_out_position);
  }

  void FinalisePositionInPage(
      EllpackPageImpl const* page, const common::Span<RegTree::Node> d_nodes,
      common::Span<FeatureType const> d_feature_types, common::Span<uint32_t const> categories,
      common::Span<RegTree::CategoricalSplitMatrix::Segment> categories_segments,
      HostDeviceVector<bst_node_t>* p_out_position) {
    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
    auto d_gpair = this->gpair;
    p_out_position->SetDevice(ctx_->gpu_id);
    p_out_position->Resize(row_partitioner->GetRows().size());

    auto new_position_op = [=] __device__(size_t row_id, int position) {
      // What happens if the user prunes the tree?
      if (!d_matrix.IsInRange(row_id)) {
        return RowPartitioner::kIgnoredTreePosition;
      }
      auto node = d_nodes[position];

      while (!node.IsLeaf()) {
        bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex());
        // Missing value
        if (isnan(element)) {
          position = node.DefaultChild();
        } else {
          bool go_left = true;
          if (common::IsCat(d_feature_types, position)) {
            auto node_cats = categories.subspan(categories_segments[position].beg,
                                                categories_segments[position].size);
            go_left = common::Decision(node_cats, element);
          } else {
            go_left = element <= node.SplitCond();
          }
          if (go_left) {
            position = node.LeftChild();
          } else {
            position = node.RightChild();
          }
        }
        node = d_nodes[position];
      }

      return position;
    };  // NOLINT

    auto d_out_position = p_out_position->DeviceSpan();
    row_partitioner->FinalisePosition(d_out_position, new_position_op);

    auto s_position = p_out_position->ConstDeviceSpan();
    positions.resize(s_position.size());
    dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
                                  s_position.size_bytes(), cudaMemcpyDeviceToDevice,
                                  ctx_->CUDACtx()->Stream()));

    dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
      bst_node_t position = d_out_position[idx];
      bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f;
      d_out_position[idx] = is_row_sampled ? ~position : position;
    });
  }
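
  // The encoding written out above can be decoded as follows (sketch; the helper name is
  // hypothetical).  Rows that were sampled out (zero Hessian) carry the bitwise complement of
  // their leaf index, so consumers can both detect them and still recover the leaf they would
  // have landed in:
  //
  //   XGBOOST_DEVICE bst_node_t DecodePosition(bst_node_t encoded, bool* sampled_out) {
  //     *sampled_out = encoded < 0;  // ~nidx is negative for any valid nidx >= 0
  //     return encoded < 0 ? ~encoded : encoded;
  //   }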

  bool UpdatePredictionCache(linalg::MatrixView<float> out_preds_d, RegTree const* p_tree) {
    if (positions.empty()) {
      return false;
    }

    CHECK(p_tree);
    CHECK(out_preds_d.Device().IsCUDA());
    CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal());
    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));

    auto d_position = dh::ToSpan(positions);
    CHECK_EQ(out_preds_d.Size(), d_position.size());

    auto const& h_nodes = p_tree->GetNodes();
    dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
    dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
                                  h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice,
                                  ctx_->CUDACtx()->Stream()));
    auto d_nodes = dh::ToSpan(nodes);
    CHECK_EQ(out_preds_d.Shape(1), 1);
    dh::LaunchN(d_position.size(), ctx_->CUDACtx()->Stream(),
                [=] XGBOOST_DEVICE(std::size_t idx) mutable {
                  bst_node_t nidx = d_position[idx];
                  auto weight = d_nodes[nidx].LeafValue();
                  out_preds_d(idx, 0) += weight;
                });
    return true;
  }

  // num_histograms is the number of contiguous histograms in memory to reduce over
  void AllReduceHist(int nidx, int num_histograms) {
    monitor.Start("AllReduce");
    auto d_node_hist = hist.GetNodeHistogram(nidx).data();
    using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
    collective::AllReduce<collective::Operation::kSum>(
        ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
        page->Cuts().TotalBins() * 2 * num_histograms);

    monitor.Stop("AllReduce");
  }

  /**
   * \brief Build GPU local histograms for the left and right child of some parent node
   */
  void BuildHistLeftRight(std::vector<GPUExpandEntry> const& candidates, const RegTree& tree) {
    if (candidates.empty()) return;
    // For some nodes we will manually compute histograms,
    // for others we will do it by subtraction
    std::vector<int> hist_nidx;
    std::vector<int> subtraction_nidx;
    for (auto& e : candidates) {
      // Decide whether to build the left histogram or right histogram
      // Use sum of Hessian as a heuristic to select the node with fewest training instances
      bool fewer_right =
          e.split.right_sum.GetQuantisedHess() < e.split.left_sum.GetQuantisedHess();
      if (fewer_right) {
        hist_nidx.emplace_back(tree[e.nid].RightChild());
        subtraction_nidx.emplace_back(tree[e.nid].LeftChild());
      } else {
        hist_nidx.emplace_back(tree[e.nid].LeftChild());
        subtraction_nidx.emplace_back(tree[e.nid].RightChild());
      }
    }
    std::vector<int> all_new = hist_nidx;
    all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end());
    // Allocate the histograms
    // Guaranteed contiguous memory
    hist.AllocateHistograms(all_new);

    for (auto nidx : hist_nidx) {
      this->BuildHist(nidx);
    }

    // Reduce all in one go.
    // This gives much better latency in a distributed setting when processing a large batch.
    this->AllReduceHist(hist_nidx.at(0), hist_nidx.size());

    for (size_t i = 0; i < subtraction_nidx.size(); i++) {
      auto build_hist_nidx = hist_nidx.at(i);
      auto subtraction_trick_nidx = subtraction_nidx.at(i);
      auto parent_nidx = candidates.at(i).nid;

      if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) {
        // Calculate the other histogram manually
        this->BuildHist(subtraction_trick_nidx);
        this->AllReduceHist(subtraction_trick_nidx, 1);
      }
    }
  }
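
  // Why the allocation order in BuildHistLeftRight matters (sketch, hypothetical node ids): with
  // hist_nidx = {3, 5} and subtraction_nidx = {4, 6}, AllocateHistograms({3, 5, 4, 6}) lays the
  // four histograms out back-to-back, so a single call
  //
  //   this->AllReduceHist(/*nidx=*/3, /*num_histograms=*/2);
  //
  // reduces both freshly built histograms in one collective operation; nodes {4, 6} are then
  // filled locally by SubtractionTrick from already-reduced data and need no extra communication.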

  void ApplySplit(const GPUExpandEntry& candidate, RegTree* p_tree) {
    RegTree& tree = *p_tree;

    // Sanity check - have we created a leaf with no training instances?
    if (!collective::IsDistributed() && row_partitioner) {
      CHECK(row_partitioner->GetRows(candidate.nid).size() > 0)
          << "No training instances in this leaf!";
    }

    auto base_weight = candidate.base_weight;
    auto left_weight = candidate.left_weight * param.learning_rate;
    auto right_weight = candidate.right_weight * param.learning_rate;
    auto parent_hess =
        quantiser->ToFloatingPoint(candidate.split.left_sum + candidate.split.right_sum)
            .GetHess();
    auto left_hess = quantiser->ToFloatingPoint(candidate.split.left_sum).GetHess();
    auto right_hess = quantiser->ToFloatingPoint(candidate.split.right_sum).GetHess();

    auto is_cat = candidate.split.is_cat;
    if (is_cat) {
      // fvalue should have been set to NaN by split evaluation for categorical splits.
      CHECK(common::CheckNAN(candidate.split.fvalue));
      std::vector<common::CatBitField::value_type> split_cats;

      auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
      auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex);
      split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0);
      CHECK_LE(split_cats.size(), h_cats.size());
      std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data());

      tree.ExpandCategorical(candidate.nid, candidate.split.findex, split_cats,
                             candidate.split.dir == kLeftDir, base_weight, left_weight,
                             right_weight, candidate.split.loss_chg, parent_hess, left_hess,
                             right_hess);
    } else {
      CHECK(!common::CheckNAN(candidate.split.fvalue));
      tree.ExpandNode(candidate.nid, candidate.split.findex, candidate.split.fvalue,
                      candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight,
                      candidate.split.loss_chg, parent_hess, left_hess, right_hess);
    }
    evaluator_.ApplyTreeSplit(candidate, p_tree);

    const auto& parent = tree[candidate.nid];
    interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(),
                                  parent.RightChild());
  }

  GPUExpandEntry InitRoot(RegTree* p_tree) {
    constexpr bst_node_t kRootNIdx = 0;
    dh::XGBCachingDeviceAllocator<char> alloc;
    auto quantiser = *this->quantiser;
    auto gpair_it = dh::MakeTransformIterator<GradientPairInt64>(
        dh::tbegin(gpair),
        [=] __device__(auto const& gpair) { return quantiser.ToFixedPoint(gpair); });
    GradientPairInt64 root_sum_quantised =
        dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(),
                   GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
    using ReduceT = typename decltype(root_sum_quantised)::ValueT;
    collective::Allreduce<collective::Operation::kSum>(
        reinterpret_cast<ReduceT*>(&root_sum_quantised), 2);

    hist.AllocateHistograms({kRootNIdx});
    this->BuildHist(kRootNIdx);
    this->AllReduceHist(kRootNIdx, 1);

    // Remember root stats
    auto root_sum = quantiser.ToFloatingPoint(root_sum_quantised);
    p_tree->Stat(kRootNIdx).sum_hess = root_sum.GetHess();
    auto weight = CalcWeight(param, root_sum);
    p_tree->Stat(kRootNIdx).base_weight = weight;
    (*p_tree)[kRootNIdx].SetLeaf(param.learning_rate * weight);

    // Generate first split
    auto root_entry = this->EvaluateRootSplit(root_sum_quantised);
    return root_entry;
  }

  void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat,
                  ObjInfo const* task, RegTree* p_tree,
                  HostDeviceVector<bst_node_t>* p_out_position) {
    auto& tree = *p_tree;
    // Process a maximum of 32 nodes at a time
    Driver<GPUExpandEntry> driver(param, 32);

    monitor.Start("Reset");
    this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_);
    monitor.Stop("Reset");

    monitor.Start("InitRoot");
    driver.Push({this->InitRoot(p_tree)});
    monitor.Stop("InitRoot");

    // The set of leaves that can be expanded asynchronously
    auto expand_set = driver.Pop();
    while (!expand_set.empty()) {
      for (auto& candidate : expand_set) {
        this->ApplySplit(candidate, p_tree);
      }
      // Get the candidates we are allowed to expand further,
      // e.g. we do not bother further processing nodes whose children are beyond max depth
      std::vector<GPUExpandEntry> filtered_expand_set;
      std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set),
                   [&](const auto& e) { return driver.IsChildValid(e); });

      auto new_candidates =
          pinned.GetSpan<GPUExpandEntry>(filtered_expand_set.size() * 2, GPUExpandEntry());

      monitor.Start("UpdatePosition");
      // Update position is only run when the child is valid, instead of right after apply
      // split (as in the approx tree method). Hence we have the finalise position call
      // in GPU Hist.
      this->UpdatePosition(filtered_expand_set, p_tree);
      monitor.Stop("UpdatePosition");

      monitor.Start("BuildHist");
      this->BuildHistLeftRight(filtered_expand_set, tree);
      monitor.Stop("BuildHist");

      monitor.Start("EvaluateSplits");
      this->EvaluateSplits(filtered_expand_set, *p_tree, new_candidates);
      monitor.Stop("EvaluateSplits");
      dh::DefaultStream().Sync();
      driver.Push(new_candidates.begin(), new_candidates.end());
      expand_set = driver.Pop();
    }

    monitor.Start("FinalisePosition");
    this->FinalisePosition(p_tree, p_fmat, *task, p_out_position);
    monitor.Stop("FinalisePosition");
  }
};

class GPUHistMaker : public TreeUpdater {
  using GradientSumT = GradientPairPrecise;

 public:
  explicit GPUHistMaker(Context const* ctx, ObjInfo const* task)
      : TreeUpdater(ctx), task_{task} {}
  void Configure(const Args& args) override {
    // Used in tests to count how many configurations are performed
    LOG(DEBUG) << "[GPU Hist]: Configure";
    hist_maker_param_.UpdateAllowUnknown(args);
    dh::CheckComputeCapability();
    initialised_ = false;

    monitor_.Init("updater_gpu_hist");
  }

  void LoadConfig(Json const& in) override {
    auto const& config = get<Object const>(in);
    FromJson(config.at("hist_train_param"), &this->hist_maker_param_);
    initialised_ = false;
  }
  void SaveConfig(Json* p_out) const override {
    auto& out = *p_out;
    out["hist_train_param"] = ToJson(hist_maker_param_);
  }

  ~GPUHistMaker() {  // NOLINT
    dh::GlobalMemoryLogger().Log();
  }

  void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
              common::Span<HostDeviceVector<bst_node_t>> out_position,
              const std::vector<RegTree*>& trees) override {
    monitor_.Start("Update");

    CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented();
    auto gpair_hdv = gpair->Data();
    // build tree
    try {
      std::size_t t_idx{0};
      for (xgboost::RegTree* tree : trees) {
        this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]);
        this->hist_maker_param_.CheckTreesSynchronized(tree);
        ++t_idx;
      }
      dh::safe_cuda(cudaGetLastError());
    } catch (const std::exception& e) {
      LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
    }
    monitor_.Stop("Update");
  }

  void InitDataOnce(TrainParam const* param, DMatrix* dmat) {
    CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
    info_ = &dmat->Info();

    // Synchronise the column sampling seed
    uint32_t column_sampling_seed = common::GlobalRandom()();
    collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
    this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);

    auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
    info_->feature_types.SetDevice(ctx_->gpu_id);
    maker = std::make_unique<GPUHistMakerDevice>(
        ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
        *param, column_sampler_, info_->num_col_, batch_param);

    p_last_fmat_ = dmat;
    initialised_ = true;
  }

  void InitData(TrainParam const* param, DMatrix* dmat, RegTree const* p_tree) {
    if (!initialised_) {
      monitor_.Start("InitDataOnce");
      this->InitDataOnce(param, dmat);
      monitor_.Stop("InitDataOnce");
    }
    p_last_tree_ = p_tree;
    CHECK(hist_maker_param_.GetInitialised());
  }

  void UpdateTree(TrainParam const* param, HostDeviceVector<GradientPair>* gpair,
                  DMatrix* p_fmat, RegTree* p_tree, HostDeviceVector<bst_node_t>* p_out_position) {
    monitor_.Start("InitData");
    this->InitData(param, p_fmat, p_tree);
    monitor_.Stop("InitData");

    gpair->SetDevice(ctx_->gpu_id);
    maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
  }

  bool UpdatePredictionCache(const DMatrix* data,
                             linalg::MatrixView<bst_float> p_out_preds) override {
    if (maker == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
      return false;
    }
    monitor_.Start("UpdatePredictionCache");
    bool result = maker->UpdatePredictionCache(p_out_preds, p_last_tree_);
    monitor_.Stop("UpdatePredictionCache");
    return result;
  }

  MetaInfo* info_{};                          // NOLINT
  std::unique_ptr<GPUHistMakerDevice> maker;  // NOLINT

  [[nodiscard]] char const* Name() const override { return "grow_gpu_hist"; }
  [[nodiscard]] bool HasNodePosition() const override { return true; }

 private:
  bool initialised_{false};

  HistMakerTrainParam hist_maker_param_;

  DMatrix* p_last_fmat_{nullptr};
  RegTree const* p_last_tree_{nullptr};
  ObjInfo const* task_{nullptr};

  common::Monitor monitor_;
  std::shared_ptr<common::ColumnSampler> column_sampler_;
};

#if !defined(GTEST_TEST)
XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
    .describe("Grow tree with GPU.")
    .set_body([](Context const* ctx, ObjInfo const* task) { return new GPUHistMaker(ctx, task); });
#endif  // !defined(GTEST_TEST)

class GPUGlobalApproxMaker : public TreeUpdater {
 public:
  explicit GPUGlobalApproxMaker(Context const* ctx, ObjInfo const* task)
      : TreeUpdater(ctx), task_{task} {}
  void Configure(Args const& args) override {
    // Used in tests to count how many configurations are performed
    LOG(DEBUG) << "[GPU Approx]: Configure";
    hist_maker_param_.UpdateAllowUnknown(args);
    if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) {
      LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU.";
    }
    dh::CheckComputeCapability();
    initialised_ = false;

    monitor_.Init(this->Name());
  }

  void LoadConfig(Json const& in) override {
    auto const& config = get<Object const>(in);
    FromJson(config.at("hist_train_param"), &this->hist_maker_param_);
    initialised_ = false;
  }
  void SaveConfig(Json* p_out) const override {
    auto& out = *p_out;
    out["hist_train_param"] = ToJson(hist_maker_param_);
  }

  ~GPUGlobalApproxMaker() override { dh::GlobalMemoryLogger().Log(); }

  void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat,
              common::Span<HostDeviceVector<bst_node_t>> out_position,
              const std::vector<RegTree*>& trees) override {
    monitor_.Start("Update");

    this->InitDataOnce(p_fmat);
    // build tree
    hess_.resize(gpair->Size());
    auto hess = dh::ToSpan(hess_);

    gpair->SetDevice(ctx_->Device());
    auto d_gpair = gpair->Data()->ConstDeviceSpan();
    auto cuctx = ctx_->CUDACtx();
    thrust::transform(cuctx->CTP(), dh::tcbegin(d_gpair), dh::tcend(d_gpair), dh::tbegin(hess),
                      [=] XGBOOST_DEVICE(GradientPair const& g) { return g.GetHess(); });

    auto const& info = p_fmat->Info();
    info.feature_types.SetDevice(ctx_->Device());
    auto batch = BatchParam{param->max_bin, hess, !task_->const_hess};
    maker_ = std::make_unique<GPUHistMakerDevice>(
        ctx_, !p_fmat->SingleColBlock(), info.feature_types.ConstDeviceSpan(), info.num_row_,
        *param, column_sampler_, info.num_col_, batch);

    std::size_t t_idx{0};
    for (xgboost::RegTree* tree : trees) {
      this->UpdateTree(gpair->Data(), p_fmat, tree, &out_position[t_idx]);
      this->hist_maker_param_.CheckTreesSynchronized(tree);
      ++t_idx;
    }

    monitor_.Stop("Update");
  }

  void InitDataOnce(DMatrix* p_fmat) {
    if (this->initialised_) {
      return;
    }
    monitor_.Start(__func__);
    CHECK(ctx_->IsCUDA()) << error::InvalidCUDAOrdinal();

    // Synchronise the column sampling seed
    uint32_t column_sampling_seed = common::GlobalRandom()();
    collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
    this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);

    p_last_fmat_ = p_fmat;
    initialised_ = true;
    monitor_.Stop(__func__);
  }

  void InitData(DMatrix* p_fmat, RegTree const* p_tree) {
    this->InitDataOnce(p_fmat);
    p_last_tree_ = p_tree;
    CHECK(hist_maker_param_.GetInitialised());
  }

  void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
                  HostDeviceVector<bst_node_t>* p_out_position) {
    monitor_.Start("InitData");
    this->InitData(p_fmat, p_tree);
    monitor_.Stop("InitData");

    gpair->SetDevice(ctx_->gpu_id);
    maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
  }

  bool UpdatePredictionCache(const DMatrix* data,
                             linalg::MatrixView<bst_float> p_out_preds) override {
    if (maker_ == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
      return false;
    }
    monitor_.Start("UpdatePredictionCache");
    bool result = maker_->UpdatePredictionCache(p_out_preds, p_last_tree_);
    monitor_.Stop("UpdatePredictionCache");
    return result;
  }

  [[nodiscard]] char const* Name() const override { return "grow_gpu_approx"; }
  [[nodiscard]] bool HasNodePosition() const override { return true; }

 private:
  bool initialised_{false};

  HistMakerTrainParam hist_maker_param_;
  dh::device_vector<float> hess_;
  std::shared_ptr<common::ColumnSampler> column_sampler_;
  std::unique_ptr<GPUHistMakerDevice> maker_;

  DMatrix* p_last_fmat_{nullptr};
  RegTree const* p_last_tree_{nullptr};
  ObjInfo const* task_{nullptr};

  common::Monitor monitor_;
};

#if !defined(GTEST_TEST)
XGBOOST_REGISTER_TREE_UPDATER(GPUApproxMaker, "grow_gpu_approx")
    .describe("Grow tree with GPU.")
    .set_body([](Context const* ctx, ObjInfo const* task) {
      return new GPUGlobalApproxMaker(ctx, task);
    });
#endif  // !defined(GTEST_TEST)
}  // namespace xgboost::tree
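
// Illustrative sketch of how the updaters registered above are obtained through the registry
// (the TreeUpdater::Create signature is assumed from xgboost/tree_updater.h in this version;
// the parameter values are hypothetical):
//
//   xgboost::Context ctx;
//   ctx.UpdateAllowUnknown(xgboost::Args{{"gpu_id", "0"}});
//   xgboost::ObjInfo task{xgboost::ObjInfo::kRegression};
//   std::unique_ptr<xgboost::TreeUpdater> updater{
//       xgboost::TreeUpdater::Create("grow_gpu_hist", &ctx, &task)};
//   updater->Configure(xgboost::Args{});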