Rework the precision metric. (#9222)
- Rework the precision metric for both CPU and GPU. - Mention it in the document. - Cleanup old support code for GPU ranking metric. - Deterministic GPU implementation. * Drop support for classification. * type. * use batch shape. * lint. * cpu build. * cpu build. * lint. * Tests. * Fix. * Cleanup error message.
This commit is contained in:
@@ -825,176 +825,6 @@ XGBOOST_DEVICE auto tcrend(xgboost::common::Span<T> const &span) { // NOLINT
|
||||
return tcrbegin(span) + span.size();
|
||||
}
|
||||
|
||||
// This type sorts an array which is divided into multiple groups. The sorting is influenced
|
||||
// by the function object 'Comparator'
|
||||
template <typename T>
|
||||
class SegmentSorter {
|
||||
private:
|
||||
// Items sorted within the group
|
||||
caching_device_vector<T> ditems_;
|
||||
|
||||
// Original position of the items before they are sorted descending within their groups
|
||||
caching_device_vector<uint32_t> doriginal_pos_;
|
||||
|
||||
// Segments within the original list that delineates the different groups
|
||||
caching_device_vector<uint32_t> group_segments_;
|
||||
|
||||
// Need this on the device as it is used in the kernels
|
||||
caching_device_vector<uint32_t> dgroups_; // Group information on device
|
||||
|
||||
// Where the item that was originally present at position 'x' moved to after the sort
|
||||
caching_device_vector<uint32_t> dindexable_sorted_pos_;
|
||||
|
||||
// Initialize everything but the segments
|
||||
void Init(uint32_t num_elems) {
|
||||
ditems_.resize(num_elems);
|
||||
|
||||
doriginal_pos_.resize(num_elems);
|
||||
thrust::sequence(doriginal_pos_.begin(), doriginal_pos_.end());
|
||||
}
|
||||
|
||||
// Initialize all with group info
|
||||
void Init(const std::vector<uint32_t> &groups) {
|
||||
uint32_t num_elems = groups.back();
|
||||
this->Init(num_elems);
|
||||
this->CreateGroupSegments(groups);
|
||||
}
|
||||
|
||||
public:
|
||||
// This needs to be public due to device lambda
|
||||
void CreateGroupSegments(const std::vector<uint32_t> &groups) {
|
||||
uint32_t num_elems = groups.back();
|
||||
group_segments_.resize(num_elems, 0);
|
||||
|
||||
dgroups_ = groups;
|
||||
|
||||
if (GetNumGroups() == 1) return; // There are no segments; hence, no need to compute them
|
||||
|
||||
// Define the segments by assigning a group ID to each element
|
||||
const uint32_t *dgroups = dgroups_.data().get();
|
||||
uint32_t ngroups = dgroups_.size();
|
||||
auto ComputeGroupIDLambda = [=] __device__(uint32_t idx) {
|
||||
return thrust::upper_bound(thrust::seq, dgroups, dgroups + ngroups, idx) -
|
||||
dgroups - 1;
|
||||
}; // NOLINT
|
||||
|
||||
thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
|
||||
thrust::make_counting_iterator(num_elems),
|
||||
group_segments_.begin(),
|
||||
ComputeGroupIDLambda);
|
||||
}
|
||||
|
||||
// Accessors; the spans returned below wrap device pointers
|
||||
inline uint32_t GetNumItems() const { return ditems_.size(); }
|
||||
inline const xgboost::common::Span<const T> GetItemsSpan() const {
|
||||
return { ditems_.data().get(), ditems_.size() };
|
||||
}
|
||||
|
||||
inline const xgboost::common::Span<const uint32_t> GetOriginalPositionsSpan() const {
|
||||
return { doriginal_pos_.data().get(), doriginal_pos_.size() };
|
||||
}
|
||||
|
||||
inline const xgboost::common::Span<const uint32_t> GetGroupSegmentsSpan() const {
|
||||
return { group_segments_.data().get(), group_segments_.size() };
|
||||
}
|
||||
|
||||
inline uint32_t GetNumGroups() const { return dgroups_.size() - 1; }
|
||||
inline const xgboost::common::Span<const uint32_t> GetGroupsSpan() const {
|
||||
return { dgroups_.data().get(), dgroups_.size() };
|
||||
}
|
||||
|
||||
inline const xgboost::common::Span<const uint32_t> GetIndexableSortedPositionsSpan() const {
|
||||
return { dindexable_sorted_pos_.data().get(), dindexable_sorted_pos_.size() };
|
||||
}
|
||||
|
||||
// Sort an array that is divided into multiple groups. The array is sorted within each group.
|
||||
// This version provides the group information that is on the host.
|
||||
// The array is sorted based on an adaptable binary predicate. By default a stateless predicate
|
||||
// is used.
|
||||
template <typename Comparator = thrust::greater<T>>
|
||||
void SortItems(const T *ditems, uint32_t item_size, const std::vector<uint32_t> &groups,
|
||||
const Comparator &comp = Comparator()) {
|
||||
this->Init(groups);
|
||||
this->SortItems(ditems, item_size, this->GetGroupSegmentsSpan(), comp);
|
||||
}
|
||||
|
||||
// Sort an array that is divided into multiple groups. The array is sorted within each group.
|
||||
// This version provides the group information that is on the device.
|
||||
// The array is sorted based on an adaptable binary predicate. By default a stateless predicate
|
||||
// is used.
|
||||
template <typename Comparator = thrust::greater<T>>
|
||||
void SortItems(const T *ditems, uint32_t item_size,
|
||||
const xgboost::common::Span<const uint32_t> &group_segments,
|
||||
const Comparator &comp = Comparator()) {
|
||||
this->Init(item_size);
|
||||
|
||||
// Sort the items that are grouped. We would like to avoid using predicates to perform the sort,
|
||||
// as thrust resorts to using a merge sort as opposed to a much much faster radix sort
|
||||
// when comparators are used. Hence, the following algorithm is used. This is done so that
|
||||
// we can grab the appropriate related values from the original list later, after the
|
||||
// items are sorted.
|
||||
//
|
||||
// Here is the internal representation:
|
||||
// dgroups_: [ 0, 3, 5, 8, 10 ]
|
||||
// group_segments_: 0 0 0 | 1 1 | 2 2 2 | 3 3
|
||||
// doriginal_pos_: 0 1 2 | 3 4 | 5 6 7 | 8 9
|
||||
// ditems_: 1 0 1 | 2 1 | 1 3 3 | 4 4 (from original items)
|
||||
//
|
||||
// Sort the items first and make a note of the original positions in doriginal_pos_
|
||||
// based on the sort
|
||||
// ditems_: 4 4 3 3 2 1 1 1 1 0
|
||||
// doriginal_pos_: 8 9 6 7 3 0 2 4 5 1
|
||||
// NOTE: This consumes space, but is much faster than some of the other approaches - sorting
|
||||
// in kernel, sorting using predicates etc.
|
||||
|
||||
ditems_.assign(thrust::device_ptr<const T>(ditems),
|
||||
thrust::device_ptr<const T>(ditems) + item_size);
|
||||
|
||||
// Allocator to be used by sort for managing space overhead while sorting
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
thrust::stable_sort_by_key(thrust::cuda::par(alloc),
|
||||
ditems_.begin(), ditems_.end(),
|
||||
doriginal_pos_.begin(), comp);
|
||||
|
||||
if (GetNumGroups() == 1) return; // The entire array is sorted, as it isn't segmented
|
||||
|
||||
// Next, gather the segments based on the doriginal_pos_. This is to reflect the
|
||||
// holistic item sort order on the segments
|
||||
// group_segments_c_: 3 3 2 2 1 0 0 1 2 0
|
||||
// doriginal_pos_: 8 9 6 7 3 0 2 4 5 1 (stays the same)
|
||||
caching_device_vector<uint32_t> group_segments_c(item_size);
|
||||
thrust::gather(doriginal_pos_.begin(), doriginal_pos_.end(),
|
||||
dh::tcbegin(group_segments), group_segments_c.begin());
|
||||
|
||||
// Now, sort the group segments so that you may bring the items within the group together,
|
||||
// in the process also noting the relative changes to the doriginal_pos_ while that happens
|
||||
// group_segments_c_: 0 0 0 1 1 2 2 2 3 3
|
||||
// doriginal_pos_: 0 2 1 3 4 6 7 5 8 9
|
||||
thrust::stable_sort_by_key(thrust::cuda::par(alloc),
|
||||
group_segments_c.begin(), group_segments_c.end(),
|
||||
doriginal_pos_.begin(), thrust::less<uint32_t>());
|
||||
|
||||
// Finally, gather the original items based on doriginal_pos_ to sort the input and
|
||||
// to store them in ditems_
|
||||
// doriginal_pos_: 0 2 1 3 4 6 7 5 8 9 (stays the same)
|
||||
// ditems_: 1 1 0 2 1 3 3 1 4 4 (from unsorted items - ditems)
|
||||
thrust::gather(doriginal_pos_.begin(), doriginal_pos_.end(),
|
||||
thrust::device_ptr<const T>(ditems), ditems_.begin());
|
||||
}
|
||||
|
||||
// Determine where an item that was originally present at position 'x' has been relocated to
|
||||
// after a sort. Creation of such an index has to be explicitly requested after a sort
|
||||
void CreateIndexableSortedPositions() {
|
||||
dindexable_sorted_pos_.resize(GetNumItems());
|
||||
thrust::scatter(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
|
||||
thrust::make_counting_iterator(GetNumItems()), // Rearrange indices...
|
||||
// ...based on this map
|
||||
dh::tcbegin(GetOriginalPositionsSpan()),
|
||||
dindexable_sorted_pos_.begin()); // Write results into this
|
||||
}
|
||||
};
|
||||
|
||||
// Atomic add function for gradients
|
||||
template <typename OutputGradientT, typename InputGradientT>
|
||||
XGBOOST_DEV_INLINE void AtomicAddGpair(OutputGradientT* dest,
|
||||
|
||||
@@ -8,8 +8,7 @@
|
||||
#include "xgboost/host_device_vector.h" // HostDeviceVector
|
||||
#include "xgboost/span.h" // Span
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
namespace xgboost::common {
|
||||
struct OptionalWeights {
|
||||
Span<float const> weights;
|
||||
float dft{1.0f}; // fixme: make this compile time constant
|
||||
@@ -18,7 +17,8 @@ struct OptionalWeights {
|
||||
explicit OptionalWeights(float w) : dft{w} {}
|
||||
|
||||
XGBOOST_DEVICE float operator[](size_t i) const { return weights.empty() ? dft : weights[i]; }
|
||||
auto Empty() const { return weights.empty(); }
|
||||
[[nodiscard]] auto Empty() const { return weights.empty(); }
|
||||
[[nodiscard]] auto Size() const { return weights.size(); }
|
||||
};
|
||||
|
||||
inline OptionalWeights MakeOptionalWeights(Context const* ctx,
|
||||
@@ -28,6 +28,5 @@ inline OptionalWeights MakeOptionalWeights(Context const* ctx,
|
||||
}
|
||||
return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::common
|
||||
#endif // XGBOOST_COMMON_OPTIONAL_WEIGHT_H_
|
||||
|
||||
@@ -90,6 +90,9 @@ void HostSketchContainer::PushAdapterBatch(Batch const &batch, size_t base_rowid
|
||||
MetaInfo const &info, float missing) {
|
||||
auto const &h_weights =
|
||||
(use_group_ind_ ? detail::UnrollGroupWeights(info) : info.weights_.HostVector());
|
||||
if (!use_group_ind_ && !h_weights.empty()) {
|
||||
CHECK_EQ(h_weights.size(), batch.Size()) << "Invalid size of sample weight.";
|
||||
}
|
||||
|
||||
auto is_valid = data::IsValidFunctor{missing};
|
||||
auto weights = OptionalWeights{Span<float const>{h_weights}};
|
||||
|
||||
@@ -19,12 +19,12 @@
|
||||
|
||||
#include "categorical.h"
|
||||
#include "common.h"
|
||||
#include "error_msg.h" // GroupWeight
|
||||
#include "optional_weight.h" // OptionalWeights
|
||||
#include "threading_utils.h"
|
||||
#include "timer.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
namespace xgboost::common {
|
||||
/*!
|
||||
* \brief experimental wsummary
|
||||
* \tparam DType type of data content
|
||||
@@ -695,13 +695,18 @@ inline std::vector<float> UnrollGroupWeights(MetaInfo const &info) {
|
||||
return group_weights;
|
||||
}
|
||||
|
||||
size_t n_samples = info.num_row_;
|
||||
auto const &group_ptr = info.group_ptr_;
|
||||
std::vector<float> results(n_samples);
|
||||
CHECK_GE(group_ptr.size(), 2);
|
||||
CHECK_EQ(group_ptr.back(), n_samples);
|
||||
|
||||
auto n_groups = group_ptr.size() - 1;
|
||||
CHECK_EQ(info.weights_.Size(), n_groups) << error::GroupWeight();
|
||||
|
||||
bst_row_t n_samples = info.num_row_;
|
||||
std::vector<float> results(n_samples);
|
||||
CHECK_EQ(group_ptr.back(), n_samples)
|
||||
<< error::GroupSize() << " the number of rows from the data.";
|
||||
size_t cur_group = 0;
|
||||
for (size_t i = 0; i < n_samples; ++i) {
|
||||
for (bst_row_t i = 0; i < n_samples; ++i) {
|
||||
results[i] = group_weights[cur_group];
|
||||
if (i == group_ptr[cur_group + 1]) {
|
||||
cur_group++;
|
||||
@@ -1010,6 +1015,5 @@ class SortedSketchContainer : public SketchContainerImpl<WXQuantileSketch<float,
|
||||
*/
|
||||
void PushColPage(SparsePage const &page, MetaInfo const &info, Span<float const> hessian);
|
||||
};
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::common
|
||||
#endif // XGBOOST_COMMON_QUANTILE_H_
|
||||
|
||||
@@ -114,9 +114,20 @@ void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUS
|
||||
|
||||
DMLC_REGISTER_PARAMETER(LambdaRankParam);
|
||||
|
||||
// Host-side initialization for PreCache: validates the labels for the "pre" metric.
// The Context argument is unnamed and unused here.
void PreCache::InitOnCPU(Context const*, MetaInfo const& info) {
|
||||
// Host view of the labels, restricted to column 0 of the label matrix.
auto const& h_label = info.labels.HostView().Slice(linalg::All(), 0);
|
||||
// CheckPreLabels raises a CHECK failure unless the labels are binary; std::all_of is
// passed in as the host implementation of the "all of" predicate test.
CheckPreLabels("pre", h_label,
|
||||
[](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
// Stub compiled only when XGBOOST_USE_CUDA is not defined; it ignores both arguments and
// calls common::AssertGPUSupport() (presumably reports that GPU support is unavailable).
void PreCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
|
||||
void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {
|
||||
auto const& h_label = info.labels.HostView().Slice(linalg::All(), 0);
|
||||
CheckMapLabels(h_label, [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
|
||||
CheckPreLabels("map", h_label,
|
||||
[](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
|
||||
@@ -205,8 +205,13 @@ void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
|
||||
[=] XGBOOST_DEVICE(std::size_t i) { d_discount[i] = CalcDCGDiscount(i); });
|
||||
}
|
||||
|
||||
// Device-side initialization for PreCache: validates the labels for the "pre" metric.
void PreCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
|
||||
// View of the labels on device ctx->gpu_id, restricted to column 0 of the label matrix.
auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
|
||||
// Same binary-label check as the CPU path, with CheckMAPOp standing in for std::all_of.
// NOTE(review): CheckMAPOp is defined outside this view; presumably a device "all of".
CheckPreLabels("pre", d_label, CheckMAPOp{ctx->CUDACtx()});
|
||||
}
|
||||
|
||||
void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
|
||||
auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
|
||||
CheckMapLabels(d_label, CheckMAPOp{ctx->CUDACtx()});
|
||||
CheckPreLabels("map", d_label, CheckMAPOp{ctx->CUDACtx()});
|
||||
}
|
||||
} // namespace xgboost::ltr
|
||||
|
||||
@@ -366,18 +366,43 @@ bool IsBinaryRel(linalg::VectorView<float const> label, AllOf all_of) {
|
||||
});
|
||||
}
|
||||
/**
|
||||
* \brief Validate label for MAP
|
||||
* \brief Validate label for precision-based metric.
|
||||
*
|
||||
* \tparam AllOf Implementation of std::all_of. Specified as a parameter to reuse the check for
|
||||
* both CPU and GPU.
|
||||
*/
|
||||
template <typename AllOf>
|
||||
void CheckMapLabels(linalg::VectorView<float const> label, AllOf all_of) {
|
||||
void CheckPreLabels(StringView name, linalg::VectorView<float const> label, AllOf all_of) {
|
||||
auto s_label = label.Values();
|
||||
auto is_binary = IsBinaryRel(label, all_of);
|
||||
CHECK(is_binary) << "MAP can only be used with binary labels.";
|
||||
CHECK(is_binary) << name << " can only be used with binary labels.";
|
||||
}
|
||||
|
||||
/**
 * \brief Ranking cache for the "pre" metric, derived from RankingCache.
 *
 * The constructor forwards its arguments to RankingCache and then runs InitOnCPU or
 * InitOnCUDA depending on ctx->IsCPU(). A buffer of doubles is created lazily by Pre().
 */
class PreCache : public RankingCache {
|
||||
// Buffer resized to this->Groups() elements on the first call to Pre().
// NOTE(review): presumably stores one precision value per query group - confirm.
HostDeviceVector<double> pre_;
|
||||
|
||||
// Backend-specific initialization, selected by the constructor.
void InitOnCPU(Context const* ctx, MetaInfo const& info);
|
||||
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
|
||||
|
||||
public:
|
||||
// Builds the RankingCache base from (ctx, info, p), then dispatches to the CPU or CUDA
// initializer based on ctx->IsCPU().
PreCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
|
||||
: RankingCache{ctx, info, p} {
|
||||
if (ctx->IsCPU()) {
|
||||
this->InitOnCPU(ctx, info);
|
||||
} else {
|
||||
this->InitOnCUDA(ctx, info);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the buffer as a span: the host span when ctx->IsCPU(), otherwise the device span.
common::Span<double> Pre(Context const* ctx) {
|
||||
// Lazy setup: while the buffer is empty, bind it to ctx->gpu_id and size it to Groups().
if (pre_.Empty()) {
|
||||
pre_.SetDevice(ctx->gpu_id);
|
||||
pre_.Resize(this->Groups());
|
||||
}
|
||||
return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan();
|
||||
}
|
||||
};
|
||||
|
||||
class MAPCache : public RankingCache {
|
||||
// Total number of relevant documents for each group
|
||||
HostDeviceVector<double> n_rel_;
|
||||
|
||||
Reference in New Issue
Block a user