219 lines
7.7 KiB
Plaintext
219 lines
7.7 KiB
Plaintext
/**
|
|
* Copyright 2020-2024, XGBoost Contributors
|
|
*/
|
|
#ifndef XGBOOST_COMMON_QUANTILE_CUH_
|
|
#define XGBOOST_COMMON_QUANTILE_CUH_
|
|
|
|
#include <thrust/logical.h> // for any_of
|
|
|
|
#include "categorical.h"
|
|
#include "cuda_context.cuh" // for CUDAContext
|
|
#include "device_helpers.cuh"
|
|
#include "error_msg.h" // for InvalidMaxBin
|
|
#include "quantile.h"
|
|
#include "timer.h"
|
|
#include "xgboost/data.h"
|
|
#include "xgboost/span.h"
|
|
|
|
namespace xgboost {
|
|
namespace common {
|
|
|
|
class HistogramCuts;
|
|
using WQSketch = WQuantileSketch<bst_float, bst_float>;
|
|
using SketchEntry = WQSketch::Entry;
|
|
|
|
namespace detail {
|
|
struct SketchUnique {
|
|
XGBOOST_DEVICE bool operator()(SketchEntry const& a, SketchEntry const& b) const {
|
|
return a.value - b.value == 0;
|
|
}
|
|
};
|
|
} // namespace detail
|
|
|
|
/*!
|
|
* \brief A container that holds the device sketches. Sketching is performed per-column,
|
|
* but fused into single operation for performance.
|
|
*/
|
|
class SketchContainer {
|
|
public:
|
|
static constexpr float kFactor = WQSketch::kFactor;
|
|
using OffsetT = bst_idx_t;
|
|
static_assert(sizeof(OffsetT) == sizeof(size_t), "Wrong type for sketch element offset.");
|
|
|
|
private:
|
|
Monitor timer_;
|
|
HostDeviceVector<FeatureType> feature_types_;
|
|
bst_idx_t num_rows_;
|
|
bst_feature_t num_columns_;
|
|
int32_t num_bins_;
|
|
DeviceOrd device_;
|
|
|
|
// Double buffer as neither prune nor merge can be performed inplace.
|
|
dh::device_vector<SketchEntry> entries_a_;
|
|
dh::device_vector<SketchEntry> entries_b_;
|
|
bool current_buffer_ {true};
|
|
// The container is just a CSC matrix.
|
|
HostDeviceVector<OffsetT> columns_ptr_;
|
|
HostDeviceVector<OffsetT> columns_ptr_b_;
|
|
|
|
bool has_categorical_{false};
|
|
|
|
dh::device_vector<SketchEntry>& Current() {
|
|
if (current_buffer_) {
|
|
return entries_a_;
|
|
} else {
|
|
return entries_b_;
|
|
}
|
|
}
|
|
dh::device_vector<SketchEntry>& Other() {
|
|
if (!current_buffer_) {
|
|
return entries_a_;
|
|
} else {
|
|
return entries_b_;
|
|
}
|
|
}
|
|
dh::device_vector<SketchEntry> const& Current() const {
|
|
return const_cast<SketchContainer*>(this)->Current();
|
|
}
|
|
dh::device_vector<SketchEntry> const& Other() const {
|
|
return const_cast<SketchContainer*>(this)->Other();
|
|
}
|
|
void Alternate() {
|
|
current_buffer_ = !current_buffer_;
|
|
}
|
|
|
|
// Get the span of one column.
|
|
Span<SketchEntry> Column(bst_feature_t i) {
|
|
auto data = dh::ToSpan(this->Current());
|
|
auto h_ptr = columns_ptr_.ConstHostSpan();
|
|
auto c = data.subspan(h_ptr[i], h_ptr[i+1] - h_ptr[i]);
|
|
return c;
|
|
}
|
|
|
|
public:
|
|
/* \breif GPU quantile structure, with sketch data for each columns.
|
|
*
|
|
* \param max_bin Maximum number of bins per columns
|
|
* \param num_columns Total number of columns in dataset.
|
|
* \param num_rows Total number of rows in known dataset (typically the rows in current worker).
|
|
* \param device GPU ID.
|
|
*/
|
|
SketchContainer(HostDeviceVector<FeatureType> const& feature_types, bst_bin_t max_bin,
|
|
bst_feature_t num_columns, bst_idx_t num_rows, DeviceOrd device)
|
|
: num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
|
|
CHECK(device.IsCUDA());
|
|
// Initialize Sketches for this dmatrix
|
|
this->columns_ptr_.SetDevice(device_);
|
|
this->columns_ptr_.Resize(num_columns + 1, 0);
|
|
this->columns_ptr_b_.SetDevice(device_);
|
|
this->columns_ptr_b_.Resize(num_columns + 1, 0);
|
|
|
|
this->feature_types_.Resize(feature_types.Size());
|
|
this->feature_types_.Copy(feature_types);
|
|
// Pull to device.
|
|
this->feature_types_.SetDevice(device);
|
|
this->feature_types_.ConstDeviceSpan();
|
|
this->feature_types_.ConstHostSpan();
|
|
|
|
auto d_feature_types = feature_types_.ConstDeviceSpan();
|
|
has_categorical_ =
|
|
!d_feature_types.empty() &&
|
|
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});
|
|
CHECK_GE(max_bin, 2) << error::InvalidMaxBin();
|
|
|
|
timer_.Init(__func__);
|
|
}
|
|
/* \brief Return GPU ID for this container. */
|
|
[[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
|
|
/* \brief Whether the predictor matrix contains categorical features. */
|
|
bool HasCategorical() const { return has_categorical_; }
|
|
/* \brief Accumulate weights of duplicated entries in input. */
|
|
size_t ScanInput(Context const* ctx, Span<SketchEntry> entries, Span<OffsetT> d_columns_ptr_in);
|
|
/* Fix rounding error and re-establish invariance. The error is mostly generated by the
|
|
* addition inside `RMinNext` and subtraction in `RMaxPrev`. */
|
|
void FixError();
|
|
|
|
/* \brief Push sorted entries.
|
|
*
|
|
* \param entries Sorted entries.
|
|
* \param columns_ptr CSC pointer for entries.
|
|
* \param cuts_ptr CSC pointer for cuts.
|
|
* \param total_cuts Total number of cuts, equal to the back of cuts_ptr.
|
|
* \param weights (optional) data weights.
|
|
*/
|
|
void Push(Context const* ctx, Span<Entry const> entries, Span<size_t> columns_ptr,
|
|
common::Span<OffsetT> cuts_ptr, size_t total_cuts, Span<float> weights = {});
|
|
/**
|
|
* @brief Prune the quantile structure.
|
|
*
|
|
* @param to The maximum size of pruned quantile. If the size of quantile structure is
|
|
* already less than `to`, then no operation is performed.
|
|
*/
|
|
void Prune(Context const* ctx, size_t to);
|
|
/**
|
|
* @brief Merge another set of sketch.
|
|
*
|
|
* @param that_columns_ptr Column pointer of the quantile summary being merged.
|
|
* @param that Columns of the other quantile summary.
|
|
*/
|
|
void Merge(Context const* ctx, Span<OffsetT const> that_columns_ptr,
|
|
Span<SketchEntry const> that);
|
|
/**
|
|
* @brief Shrink the internal data structure to reduce memory usage. Can be used after
|
|
* prune.
|
|
*/
|
|
void ShrinkToFit() {
|
|
this->Current().shrink_to_fit();
|
|
this->Other().clear();
|
|
this->Other().shrink_to_fit();
|
|
}
|
|
|
|
/* \brief Merge quantiles from other GPU workers. */
|
|
void AllReduce(Context const* ctx, bool is_column_split);
|
|
/* \brief Create the final histogram cut values. */
|
|
void MakeCuts(Context const* ctx, HistogramCuts* cuts, bool is_column_split);
|
|
|
|
Span<SketchEntry const> Data() const {
|
|
return {this->Current().data().get(), this->Current().size()};
|
|
}
|
|
HostDeviceVector<FeatureType> const& FeatureTypes() const { return feature_types_; }
|
|
|
|
Span<OffsetT const> ColumnsPtr() const { return this->columns_ptr_.ConstDeviceSpan(); }
|
|
|
|
SketchContainer(SketchContainer&&) = default;
|
|
SketchContainer& operator=(SketchContainer&&) = default;
|
|
|
|
SketchContainer(const SketchContainer&) = delete;
|
|
SketchContainer& operator=(const SketchContainer&) = delete;
|
|
|
|
/* \brief Removes all the duplicated elements in quantile structure. */
|
|
template <typename KeyComp = thrust::equal_to<size_t>>
|
|
size_t Unique(Context const* ctx, KeyComp key_comp = thrust::equal_to<size_t>{}) {
|
|
timer_.Start(__func__);
|
|
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
|
this->columns_ptr_.SetDevice(device_);
|
|
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
|
|
CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
|
|
Span<SketchEntry> entries = dh::ToSpan(this->Current());
|
|
HostDeviceVector<OffsetT> scan_out(d_column_scan.size());
|
|
scan_out.SetDevice(device_);
|
|
auto d_scan_out = scan_out.DeviceSpan();
|
|
|
|
d_column_scan = this->columns_ptr_.DeviceSpan();
|
|
size_t n_uniques = dh::SegmentedUnique(
|
|
ctx->CUDACtx()->CTP(), d_column_scan.data(), d_column_scan.data() + d_column_scan.size(),
|
|
entries.data(), entries.data() + entries.size(), scan_out.DevicePointer(), entries.data(),
|
|
detail::SketchUnique{}, key_comp);
|
|
this->columns_ptr_.Copy(scan_out);
|
|
CHECK(!this->columns_ptr_.HostCanRead());
|
|
|
|
this->Current().resize(n_uniques);
|
|
timer_.Stop(__func__);
|
|
return n_uniques;
|
|
}
|
|
};
|
|
} // namespace common
|
|
} // namespace xgboost
|
|
|
|
#endif // XGBOOST_COMMON_QUANTILE_CUH_
|