/** * Copyright 2020-2024, XGBoost Contributors */ #ifndef XGBOOST_COMMON_QUANTILE_CUH_ #define XGBOOST_COMMON_QUANTILE_CUH_ #include // for any_of #include "categorical.h" #include "cuda_context.cuh" // for CUDAContext #include "device_helpers.cuh" #include "error_msg.h" // for InvalidMaxBin #include "quantile.h" #include "timer.h" #include "xgboost/data.h" #include "xgboost/span.h" namespace xgboost { namespace common { class HistogramCuts; using WQSketch = WQuantileSketch; using SketchEntry = WQSketch::Entry; namespace detail { struct SketchUnique { XGBOOST_DEVICE bool operator()(SketchEntry const& a, SketchEntry const& b) const { return a.value - b.value == 0; } }; } // namespace detail /*! * \brief A container that holds the device sketches. Sketching is performed per-column, * but fused into single operation for performance. */ class SketchContainer { public: static constexpr float kFactor = WQSketch::kFactor; using OffsetT = bst_idx_t; static_assert(sizeof(OffsetT) == sizeof(size_t), "Wrong type for sketch element offset."); private: Monitor timer_; HostDeviceVector feature_types_; bst_idx_t num_rows_; bst_feature_t num_columns_; int32_t num_bins_; DeviceOrd device_; // Double buffer as neither prune nor merge can be performed inplace. dh::device_vector entries_a_; dh::device_vector entries_b_; bool current_buffer_ {true}; // The container is just a CSC matrix. HostDeviceVector columns_ptr_; HostDeviceVector columns_ptr_b_; bool has_categorical_{false}; dh::device_vector& Current() { if (current_buffer_) { return entries_a_; } else { return entries_b_; } } dh::device_vector& Other() { if (!current_buffer_) { return entries_a_; } else { return entries_b_; } } dh::device_vector const& Current() const { return const_cast(this)->Current(); } dh::device_vector const& Other() const { return const_cast(this)->Other(); } void Alternate() { current_buffer_ = !current_buffer_; } // Get the span of one column. Span Column(bst_feature_t i) { auto data = dh::ToSpan(this->Current()); auto h_ptr = columns_ptr_.ConstHostSpan(); auto c = data.subspan(h_ptr[i], h_ptr[i+1] - h_ptr[i]); return c; } public: /* \breif GPU quantile structure, with sketch data for each columns. * * \param max_bin Maximum number of bins per columns * \param num_columns Total number of columns in dataset. * \param num_rows Total number of rows in known dataset (typically the rows in current worker). * \param device GPU ID. */ SketchContainer(HostDeviceVector const& feature_types, bst_bin_t max_bin, bst_feature_t num_columns, bst_idx_t num_rows, DeviceOrd device) : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { CHECK(device.IsCUDA()); // Initialize Sketches for this dmatrix this->columns_ptr_.SetDevice(device_); this->columns_ptr_.Resize(num_columns + 1, 0); this->columns_ptr_b_.SetDevice(device_); this->columns_ptr_b_.Resize(num_columns + 1, 0); this->feature_types_.Resize(feature_types.Size()); this->feature_types_.Copy(feature_types); // Pull to device. this->feature_types_.SetDevice(device); this->feature_types_.ConstDeviceSpan(); this->feature_types_.ConstHostSpan(); auto d_feature_types = feature_types_.ConstDeviceSpan(); has_categorical_ = !d_feature_types.empty() && thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{}); CHECK_GE(max_bin, 2) << error::InvalidMaxBin(); timer_.Init(__func__); } /* \brief Return GPU ID for this container. */ [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; } /* \brief Whether the predictor matrix contains categorical features. */ bool HasCategorical() const { return has_categorical_; } /* \brief Accumulate weights of duplicated entries in input. */ size_t ScanInput(Context const* ctx, Span entries, Span d_columns_ptr_in); /* Fix rounding error and re-establish invariance. The error is mostly generated by the * addition inside `RMinNext` and subtraction in `RMaxPrev`. */ void FixError(); /* \brief Push sorted entries. * * \param entries Sorted entries. * \param columns_ptr CSC pointer for entries. * \param cuts_ptr CSC pointer for cuts. * \param total_cuts Total number of cuts, equal to the back of cuts_ptr. * \param weights (optional) data weights. */ void Push(Context const* ctx, Span entries, Span columns_ptr, common::Span cuts_ptr, size_t total_cuts, Span weights = {}); /** * @brief Prune the quantile structure. * * @param to The maximum size of pruned quantile. If the size of quantile structure is * already less than `to`, then no operation is performed. */ void Prune(Context const* ctx, size_t to); /** * @brief Merge another set of sketch. * * @param that_columns_ptr Column pointer of the quantile summary being merged. * @param that Columns of the other quantile summary. */ void Merge(Context const* ctx, Span that_columns_ptr, Span that); /** * @brief Shrink the internal data structure to reduce memory usage. Can be used after * prune. */ void ShrinkToFit() { this->Current().shrink_to_fit(); this->Other().clear(); this->Other().shrink_to_fit(); } /* \brief Merge quantiles from other GPU workers. */ void AllReduce(Context const* ctx, bool is_column_split); /* \brief Create the final histogram cut values. */ void MakeCuts(Context const* ctx, HistogramCuts* cuts, bool is_column_split); Span Data() const { return {this->Current().data().get(), this->Current().size()}; } HostDeviceVector const& FeatureTypes() const { return feature_types_; } Span ColumnsPtr() const { return this->columns_ptr_.ConstDeviceSpan(); } SketchContainer(SketchContainer&&) = default; SketchContainer& operator=(SketchContainer&&) = default; SketchContainer(const SketchContainer&) = delete; SketchContainer& operator=(const SketchContainer&) = delete; /* \brief Removes all the duplicated elements in quantile structure. */ template > size_t Unique(Context const* ctx, KeyComp key_comp = thrust::equal_to{}) { timer_.Start(__func__); dh::safe_cuda(cudaSetDevice(device_.ordinal)); this->columns_ptr_.SetDevice(device_); Span d_column_scan = this->columns_ptr_.DeviceSpan(); CHECK_EQ(d_column_scan.size(), num_columns_ + 1); Span entries = dh::ToSpan(this->Current()); HostDeviceVector scan_out(d_column_scan.size()); scan_out.SetDevice(device_); auto d_scan_out = scan_out.DeviceSpan(); d_column_scan = this->columns_ptr_.DeviceSpan(); size_t n_uniques = dh::SegmentedUnique( ctx->CUDACtx()->CTP(), d_column_scan.data(), d_column_scan.data() + d_column_scan.size(), entries.data(), entries.data() + entries.size(), scan_out.DevicePointer(), entries.data(), detail::SketchUnique{}, key_comp); this->columns_ptr_.Copy(scan_out); CHECK(!this->columns_ptr_.HostCanRead()); this->Current().resize(n_uniques); timer_.Stop(__func__); return n_uniques; } }; } // namespace common } // namespace xgboost #endif // XGBOOST_COMMON_QUANTILE_CUH_