* [coll] Pass context to various functions. In the future, the `Context` object will be required for collective operations; this PR passes the context object to some of the required functions to prepare for swapping out the implementation.
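A minimal sketch of the calling pattern this prepares for, using only the signatures declared in the `quantile.cuh` header listed below; the wrapper function and its name are illustrative, not part of the change:

```cpp
// Hypothetical caller, for illustration only. The Context is created once at
// the top level and threaded down by const pointer, so the collective
// implementation behind AllReduce/MakeCuts can be swapped out later without
// touching call sites again.
void BuildCutsForWorker(xgboost::Context const* ctx,
                        xgboost::common::SketchContainer* sketch,
                        xgboost::common::HistogramCuts* out_cuts,
                        bool is_column_split) {
  // MakeCuts (declared in the header below) now receives the context
  // explicitly; AllReduce follows the same pattern.
  sketch->MakeCuts(ctx, out_cuts, is_column_split);
}
```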
#ifndef XGBOOST_COMMON_QUANTILE_CUH_
#define XGBOOST_COMMON_QUANTILE_CUH_

#include <memory>

#include "xgboost/span.h"
#include "xgboost/data.h"
#include "device_helpers.cuh"
#include "quantile.h"
#include "timer.h"
#include "categorical.h"

namespace xgboost {
namespace common {

class HistogramCuts;
using WQSketch = WQuantileSketch<bst_float, bst_float>;
using SketchEntry = WQSketch::Entry;

namespace detail {
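// Binary predicate used when de-duplicating sketch entries within a column:
// two entries are treated as duplicates when their values compare equal.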
struct SketchUnique {
  XGBOOST_DEVICE bool operator()(SketchEntry const& a, SketchEntry const& b) const {
    return a.value - b.value == 0;
  }
};
}  // namespace detail

/*!
 * \brief A container that holds the device sketches. Sketching is performed per-column,
 * but fused into a single operation for performance.
 */
class SketchContainer {
 public:
  static constexpr float kFactor = WQSketch::kFactor;
  using OffsetT = bst_row_t;
  static_assert(sizeof(OffsetT) == sizeof(size_t), "Wrong type for sketch element offset.");

 private:
  Monitor timer_;
  HostDeviceVector<FeatureType> feature_types_;
  bst_row_t num_rows_;
  bst_feature_t num_columns_;
  int32_t num_bins_;
  DeviceOrd device_;

  // Double buffer as neither prune nor merge can be performed inplace.
  dh::device_vector<SketchEntry> entries_a_;
  dh::device_vector<SketchEntry> entries_b_;
  bool current_buffer_ {true};
  // The container is just a CSC matrix.
  HostDeviceVector<OffsetT> columns_ptr_;
  HostDeviceVector<OffsetT> columns_ptr_b_;

  bool has_categorical_{false};

  dh::device_vector<SketchEntry>& Current() {
    if (current_buffer_) {
      return entries_a_;
    } else {
      return entries_b_;
    }
  }
  dh::device_vector<SketchEntry>& Other() {
    if (!current_buffer_) {
      return entries_a_;
    } else {
      return entries_b_;
    }
  }
  dh::device_vector<SketchEntry> const& Current() const {
    return const_cast<SketchContainer*>(this)->Current();
  }
  dh::device_vector<SketchEntry> const& Other() const {
    return const_cast<SketchContainer*>(this)->Other();
  }
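  // Swap which buffer is considered current; used after an out-of-place
  // operation such as prune or merge has written its result into Other().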
  void Alternate() {
    current_buffer_ = !current_buffer_;
  }

  // Get the span of one column.
  Span<SketchEntry> Column(bst_feature_t i) {
    auto data = dh::ToSpan(this->Current());
    auto h_ptr = columns_ptr_.ConstHostSpan();
    auto c = data.subspan(h_ptr[i], h_ptr[i+1] - h_ptr[i]);
    return c;
  }

 public:
  /* \brief GPU quantile structure, with sketch data for each column.
   *
   * \param max_bin Maximum number of bins per column.
   * \param num_columns Total number of columns in the dataset.
   * \param num_rows Total number of rows in the known dataset (typically the rows in the current worker).
   * \param device GPU ID.
   */
  SketchContainer(HostDeviceVector<FeatureType> const& feature_types, int32_t max_bin,
                  bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device)
      : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
    CHECK(device.IsCUDA());
    // Initialize Sketches for this dmatrix
    this->columns_ptr_.SetDevice(device_);
    this->columns_ptr_.Resize(num_columns + 1);
    this->columns_ptr_b_.SetDevice(device_);
    this->columns_ptr_b_.Resize(num_columns + 1);

    this->feature_types_.Resize(feature_types.Size());
    this->feature_types_.Copy(feature_types);
    // Pull to device.
    this->feature_types_.SetDevice(device);
    this->feature_types_.ConstDeviceSpan();
    this->feature_types_.ConstHostSpan();

    auto d_feature_types = feature_types_.ConstDeviceSpan();
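    // Cache whether any feature is categorical so later operations can check
    // has_categorical_ instead of rescanning the feature types.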
    has_categorical_ =
        !d_feature_types.empty() &&
        thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});

    timer_.Init(__func__);
  }
  /* \brief Return GPU ID for this container. */
  [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
  /* \brief Whether the predictor matrix contains categorical features. */
  bool HasCategorical() const { return has_categorical_; }
  /* \brief Accumulate weights of duplicated entries in input. */
  size_t ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_columns_ptr_in);
  /* Fix rounding error and re-establish invariance. The error is mostly generated by the
   * addition inside `RMinNext` and subtraction in `RMaxPrev`. */
  void FixError();

  /* \brief Push sorted entries.
   *
   * \param entries Sorted entries.
   * \param columns_ptr CSC pointer for entries.
   * \param cuts_ptr CSC pointer for cuts.
   * \param total_cuts Total number of cuts, equal to the back of cuts_ptr.
   * \param weights (optional) data weights.
   */
  void Push(Span<Entry const> entries, Span<size_t> columns_ptr,
            common::Span<OffsetT> cuts_ptr, size_t total_cuts,
            Span<float> weights = {});
  /* \brief Prune the quantile structure.
   *
   * \param to The maximum size of the pruned quantile structure. If the size of the
   *           quantile structure is already less than `to`, then no operation is performed.
   */
  void Prune(size_t to);
  /* \brief Merge another set of sketches.
   * \param that_columns_ptr Column pointer of the other sketch.
   * \param that Sketch entries of the other sketch.
   */
  void Merge(Span<OffsetT const> that_columns_ptr,
             Span<SketchEntry const> that);

  /* \brief Merge quantiles from other GPU workers. */
  void AllReduce(Context const* ctx, bool is_column_split);
  /* \brief Create the final histogram cut values. */
  void MakeCuts(Context const* ctx, HistogramCuts* cuts, bool is_column_split);

  Span<SketchEntry const> Data() const {
    return {this->Current().data().get(), this->Current().size()};
  }
  HostDeviceVector<FeatureType> const& FeatureTypes() const { return feature_types_; }

  Span<OffsetT const> ColumnsPtr() const { return this->columns_ptr_.ConstDeviceSpan(); }

  SketchContainer(SketchContainer&&) = default;
  SketchContainer& operator=(SketchContainer&&) = default;

  SketchContainer(const SketchContainer&) = delete;
  SketchContainer& operator=(const SketchContainer&) = delete;

  /* \brief Removes all the duplicated elements in the quantile structure. */
  template <typename KeyComp = thrust::equal_to<size_t>>
  size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
    timer_.Start(__func__);
    dh::safe_cuda(cudaSetDevice(device_.ordinal));
    this->columns_ptr_.SetDevice(device_);
    Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
    CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
    Span<SketchEntry> entries = dh::ToSpan(this->Current());
    HostDeviceVector<OffsetT> scan_out(d_column_scan.size());
    scan_out.SetDevice(device_);
    auto d_scan_out = scan_out.DeviceSpan();
    dh::XGBCachingDeviceAllocator<char> alloc;

    d_column_scan = this->columns_ptr_.DeviceSpan();
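    // Drop adjacent duplicates within each column segment (values compared via
    // detail::SketchUnique); the compacted entries are written back in place
    // and the updated per-column offsets are written to scan_out.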
    size_t n_uniques = dh::SegmentedUnique(
        thrust::cuda::par(alloc), d_column_scan.data(),
        d_column_scan.data() + d_column_scan.size(), entries.data(),
        entries.data() + entries.size(), scan_out.DevicePointer(),
        entries.data(), detail::SketchUnique{}, key_comp);
    this->columns_ptr_.Copy(scan_out);
    CHECK(!this->columns_ptr_.HostCanRead());

    this->Current().resize(n_uniques);
    timer_.Stop(__func__);
    return n_uniques;
  }
};
}  // namespace common
}  // namespace xgboost

#endif  // XGBOOST_COMMON_QUANTILE_CUH_
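For orientation, a minimal usage sketch of the container with the context passed through. This is not part of the PR: the include paths, the `UpdateAllowUnknown`-based context setup, and the literal sizes are assumptions for illustration, and a real caller would fill the sketch via `Push`/`Merge` before building cuts:

```cpp
#include "xgboost/context.h"              // Context, DeviceOrd (assumed location)
#include "xgboost/host_device_vector.h"   // HostDeviceVector
#include "../../src/common/hist_util.h"   // HistogramCuts (assumed path)
#include "../../src/common/quantile.cuh"  // SketchContainer (assumed path)

namespace xgboost {
namespace common {
void SketchExample() {
  // Configure a context for the first CUDA device; the same object is handed
  // to the context-aware entry points below.
  Context ctx;
  ctx.UpdateAllowUnknown(Args{{"device", "cuda:0"}});

  bst_feature_t n_features = 16;
  bst_row_t n_rows = 1024;
  HostDeviceVector<FeatureType> feature_types;  // empty: all numerical features

  // max_bin = 256 cut candidates per column, on the device chosen above.
  SketchContainer sketch{feature_types, 256, n_features, n_rows, ctx.Device()};

  // ... Push() sorted entries / Merge() other workers' sketches here ...

  HistogramCuts cuts;
  // The context is threaded through so AllReduce/MakeCuts can switch to the
  // new collective implementation without further signature changes.
  sketch.MakeCuts(&ctx, &cuts, /*is_column_split=*/false);
}
}  // namespace common
}  // namespace xgboost
```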