/*!
 * Copyright 2020 by XGBoost Contributors
 */
#ifndef EVALUATE_SPLITS_CUH_
#define EVALUATE_SPLITS_CUH_
#include <xgboost/span.h>

#include "../../common/categorical.h"
#include "../../common/cuda_pinned_allocator.h"
#include "../split_evaluator.h"
#include "../updater_gpu_common.cuh"
#include "expand_entry.cuh"
#include "histogram.cuh"

namespace xgboost {
namespace common {
class HistogramCuts;
}

namespace tree {
// Inputs specific to each node
struct EvaluateSplitInputs {
  int nidx;
  int depth;
  GradientPairInt64 parent_sum;
  common::Span<const bst_feature_t> feature_set;
  common::Span<const GradientPairInt64> gradient_histogram;
};

// Inputs necessary for all nodes
struct EvaluateSplitSharedInputs {
  GPUTrainingParam param;
  GradientQuantiser rounding;
  common::Span<FeatureType const> feature_types;
  common::Span<const uint32_t> feature_segments;
  common::Span<const float> feature_values;
  common::Span<const float> min_fvalue;
  bool is_dense;
  [[nodiscard]] XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
  [[nodiscard]] __device__ std::uint32_t FeatureBins(bst_feature_t fidx) const {
    return feature_segments[fidx + 1] - feature_segments[fidx];
  }
};

// Used to return internal storage regions for categoricals
// Usable on device
struct CatAccessor {
  common::Span<common::CatBitField::value_type> cat_storage;
  std::size_t node_categorical_storage_size;
  XGBOOST_DEVICE common::Span<common::CatBitField::value_type> GetNodeCatStorage(bst_node_t nidx) {
    return this->cat_storage.subspan(nidx * this->node_categorical_storage_size,
                                     this->node_categorical_storage_size);
  }
};

class GPUHistEvaluator {
  using CatST = common::CatBitField::value_type;  // categorical storage type
  // use pinned memory to stage the categories, used for sort based splits.
  using Alloc = xgboost::common::cuda::pinned_allocator<CatST>;

 private:
  TreeEvaluator tree_evaluator_;
  // storage for categories for each node, used for sort based splits.
  dh::device_vector<CatST> split_cats_;
  // host storage for categories for each node, used for sort based splits.
  std::vector<CatST, Alloc> h_split_cats_;
  // stream for copying categories from device back to host for expanding the decision tree.
  dh::CUDAStream copy_stream_;
  // storage for sorted index of feature histogram, used for sort based splits.
  dh::device_vector<bst_feature_t> cat_sorted_idx_;
  // cached input for sorting the histogram, used for sort based splits.
  using SortPair = thrust::tuple<uint32_t, double>;
  dh::device_vector<SortPair> sort_input_;
  // cache for feature index
  dh::device_vector<bst_feature_t> feature_idx_;
  // Training param used for evaluation
  TrainParam param_;
  // Do we have any categorical features that require sorting histograms?
  // use this to skip the expensive sort step
  bool need_sort_histogram_ = false;
  bool has_categoricals_ = false;
  // Number of elements of categorical storage type
  // needed to hold categoricals for a single node
  std::size_t node_categorical_storage_size_ = 0;
  // Is the data split column-wise?
  bool is_column_split_ = false;
  DeviceOrd device_;

  // Copy the categories from device to host asynchronously.
  void CopyToHost(const std::vector<bst_node_t> &nidx);

  /**
   * \brief Get host category storage of nidx for internal calculation.
   */
  auto HostCatStorage(const std::vector<bst_node_t> &nidx) {
    if (!has_categoricals_) return CatAccessor{};
    auto max_nidx = *std::max_element(nidx.begin(), nidx.end());
    std::size_t min_size = (max_nidx + 2) * node_categorical_storage_size_;
    if (h_split_cats_.size() < min_size) {
      h_split_cats_.resize(min_size);
    }
    return CatAccessor{{h_split_cats_.data(), h_split_cats_.size()},
                       node_categorical_storage_size_};
  }
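  // Illustrative sizing note (not from the upstream header): HostCatStorage and
  // DeviceCatStorage below both reserve (max_nidx + 2) * node_categorical_storage_size_
  // elements of CatST.  As a hypothetical worked example, suppose CatST is
  // uint32_t and the widest categorical feature has 64 categories; each node
  // then needs ceil(64 / 32) = 2 storage words, so evaluating nodes {1, 2}
  // grows the buffer to at least (2 + 2) * 2 = 8 words.  The concrete numbers
  // here are assumed inputs, not values prescribed by this header.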
  /**
   * \brief Get device category storage of nidx for internal calculation.
   */
  auto DeviceCatStorage(const std::vector<bst_node_t> &nidx) {
    if (!has_categoricals_) return CatAccessor{};
    auto max_nidx = *std::max_element(nidx.begin(), nidx.end());
    std::size_t min_size = (max_nidx + 2) * node_categorical_storage_size_;
    if (split_cats_.size() < min_size) {
      split_cats_.resize(min_size);
    }
    return CatAccessor{dh::ToSpan(split_cats_), node_categorical_storage_size_};
  }

  /**
   * \brief Get sorted index storage based on the left node of inputs.
   */
  auto SortedIdx(int num_nodes, bst_bin_t total_bins) {
    if (!need_sort_histogram_) return common::Span<bst_feature_t>{};
    cat_sorted_idx_.resize(num_nodes * total_bins);
    return dh::ToSpan(cat_sorted_idx_);
  }

  auto SortInput(int num_nodes, bst_bin_t total_bins) {
    if (!need_sort_histogram_) return common::Span<SortPair>{};
    sort_input_.resize(num_nodes * total_bins);
    return dh::ToSpan(sort_input_);
  }

 public:
  GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, DeviceOrd device)
      : tree_evaluator_{param, n_features, device}, param_{param} {}
  /**
   * \brief Reset the evaluator, should be called before any use.
   */
  void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
             bst_feature_t n_features, TrainParam const &param, bool is_column_split,
             DeviceOrd device);

  /**
   * \brief Get host category storage for nidx.  Different from the internal version, this
   *        returns strictly 1 node.
   */
  [[nodiscard]] common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
    copy_stream_.View().Sync();
    auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(
        nidx * node_categorical_storage_size_, node_categorical_storage_size_);
    return cats_out;
  }

  [[nodiscard]] auto GetDeviceNodeCats(bst_node_t nidx) {
    copy_stream_.View().Sync();
    if (has_categoricals_) {
      CatAccessor accessor = {dh::ToSpan(split_cats_), node_categorical_storage_size_};
      return common::KCatBitField{accessor.GetNodeCatStorage(nidx)};
    } else {
      return common::KCatBitField{};
    }
  }

  /**
   * \brief Add a split to the internal tree evaluator.
   */
  void ApplyTreeSplit(GPUExpandEntry const &candidate, RegTree *p_tree) {
    auto &tree = *p_tree;
    // Set up child constraints
    auto left_child = tree[candidate.nid].LeftChild();
    auto right_child = tree[candidate.nid].RightChild();
    tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
                             tree[candidate.nid].SplitIndex(), candidate.left_weight,
                             candidate.right_weight);
  }

  auto GetEvaluator() { return tree_evaluator_.GetEvaluator<GPUTrainingParam>(); }
  /**
   * \brief Sort the histogram based on output to obtain contiguous partitions.
   */
  common::Span<bst_feature_t const> SortHistogram(
      common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
      TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);

  // impl of evaluate splits, contains CUDA kernels so it's public
  void LaunchEvaluateSplits(bst_feature_t max_active_features,
                            common::Span<const EvaluateSplitInputs> d_inputs,
                            EvaluateSplitSharedInputs shared_inputs,
                            TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
                            common::Span<DeviceSplitCandidate> out_splits);

  /**
   * \brief Evaluate splits for left and right nodes.
   */
  void EvaluateSplits(Context const *ctx, const std::vector<bst_node_t> &nidx,
                      bst_feature_t max_active_features,
                      common::Span<const EvaluateSplitInputs> d_inputs,
                      EvaluateSplitSharedInputs shared_inputs,
                      common::Span<GPUExpandEntry> out_splits);

  /**
   * \brief Evaluate splits for root node.
   */
  GPUExpandEntry EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
                                     EvaluateSplitSharedInputs shared_inputs);
};
}  // namespace tree
}  // namespace xgboost

#endif  // EVALUATE_SPLITS_CUH_
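// Usage sketch (illustrative only, not part of the upstream header).  It shows
// the intended call order using only the API declared above; `ctx`, `param`,
// `cuts`, `feature_types`, `n_features`, `input`, `shared`, and `p_tree` are
// assumed to come from the surrounding GPU updater code:
//
//   GPUHistEvaluator evaluator{param, n_features, ctx->Device()};
//   evaluator.Reset(cuts, feature_types, n_features, param,
//                   /*is_column_split=*/false, ctx->Device());
//   // Evaluate the root split from its histogram...
//   GPUExpandEntry root = evaluator.EvaluateSingleSplit(ctx, input, shared);
//   // ...then, once the split has been applied to the tree, register the
//   // child constraints so deeper nodes are evaluated consistently.
//   evaluator.ApplyTreeSplit(root, p_tree);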