initial merge

This commit is contained in:
amdsc21
2023-03-25 04:31:55 +01:00
146 changed files with 6730 additions and 4082 deletions

View File

@@ -14,7 +14,7 @@
// clang with libstdc++ works as well
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
!defined(__APPLE__) && __has_include(<omp.h>)
!defined(__APPLE__) && __has_include(<omp.h>) && __has_include(<parallel/algorithm>)
#define GCC_HAS_PARALLEL 1
#endif // GLIC_VERSION

View File

@@ -121,17 +121,20 @@ namespace dh {
#ifdef XGBOOST_USE_NCCL
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
int line) {
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
if (code != ncclSuccess) {
std::stringstream ss;
ss << "NCCL failure :" << ncclGetErrorString(code);
ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
ss << " " << file << "(" << line << ")\n";
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError();
ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
}
ss << " " << file << "(" << line << ")";
LOG(FATAL) << ss.str();
}

View File

@@ -2,6 +2,9 @@
* Copyright 2017-2023 XGBoost contributors
*/
#pragma once
#if defined(XGBOOST_USE_CUDA)
#include <thrust/binary_search.h> // thrust::upper_bound
#include <thrust/device_malloc_allocator.h>
#include <thrust/device_ptr.h>
@@ -95,20 +98,23 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) { // NOLINT
}
namespace dh {
#ifdef XGBOOST_USE_NCCL
#ifdef XGBOOST_USE_RCCL
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
int line) {
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
if (code != ncclSuccess) {
std::stringstream ss;
ss << "NCCL failure :" << ncclGetErrorString(code);
ss << "RCCL failure: " << ncclGetErrorString(code) << ".";
ss << " " << file << "(" << line << ")\n";
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = hipPeekAtLastError();
ss << " " << thrust::system_error(err, thrust::hip_category()).what();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
}
ss << " " << file << "(" << line << ")";
LOG(FATAL) << ss.str();
}

View File

@@ -20,5 +20,9 @@ constexpr StringView GroupSize() {
// Error message reported when the number of labels differs from the number of predictions.
constexpr StringView LabelScoreSize() {
return "The size of label doesn't match the size of prediction.";
}
// Error message for input that contains `inf` (or an overly large value) while the
// `missing` parameter is not `inf`.
constexpr StringView InfInData() {
return "Input data contains `inf` or a value too large, while `missing` is not set to `inf`";
}
} // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_

View File

@@ -7,23 +7,22 @@
#ifndef XGBOOST_COMMON_HIST_UTIL_H_
#define XGBOOST_COMMON_HIST_UTIL_H_
#include <xgboost/data.h>
#include <algorithm>
#include <cstdint> // for uint32_t
#include <limits>
#include <map>
#include <memory>
#include <utility>
#include <vector>
#include "algorithm.h" // SegmentId
#include "categorical.h"
#include "common.h"
#include "quantile.h"
#include "row_set.h"
#include "threading_utils.h"
#include "timer.h"
#include "xgboost/base.h" // bst_feature_t, bst_bin_t
#include "xgboost/base.h" // for bst_feature_t, bst_bin_t
#include "xgboost/data.h"
namespace xgboost {
class GHistIndexMatrix;
@@ -392,15 +391,18 @@ class HistCollection {
}
// have we computed a histogram for i-th node?
bool RowExists(bst_uint nid) const {
[[nodiscard]] bool RowExists(bst_uint nid) const {
const uint32_t k_max = std::numeric_limits<uint32_t>::max();
return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
}
// initialize histogram collection
void Init(uint32_t nbins) {
if (nbins_ != nbins) {
nbins_ = nbins;
/**
* \brief Initialize histogram collection.
*
* \param n_total_bins Number of bins across all features.
*/
void Init(std::uint32_t n_total_bins) {
if (nbins_ != n_total_bins) {
nbins_ = n_total_bins;
// quite expensive operation, so let's do this only once
data_.clear();
}

View File

@@ -333,7 +333,7 @@ size_t constexpr JsonReader::kMaxNumLength;
Json JsonReader::Parse() {
while (true) {
SkipSpaces();
char c = PeekNextChar();
auto c = PeekNextChar();
if (c == -1) { break; }
if (c == '{') {
@@ -408,13 +408,13 @@ void JsonReader::Error(std::string msg) const {
}
namespace {
bool IsSpace(char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
bool IsSpace(JsonReader::Char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
} // anonymous namespace
// Json class
void JsonReader::SkipSpaces() {
while (cursor_.Pos() < raw_str_.size()) {
char c = raw_str_[cursor_.Pos()];
Char c = raw_str_[cursor_.Pos()];
if (IsSpace(c)) {
cursor_.Forward();
} else {
@@ -436,12 +436,12 @@ void ParseStr(std::string const& str) {
}
Json JsonReader::ParseString() {
char ch { GetConsecutiveChar('\"') }; // NOLINT
Char ch { GetConsecutiveChar('\"') }; // NOLINT
std::string str;
while (true) {
ch = GetNextChar();
if (ch == '\\') {
char next = static_cast<char>(GetNextChar());
Char next{GetNextChar()};
switch (next) {
case 'r': str += u8"\r"; break;
case 'n': str += u8"\n"; break;
@@ -466,8 +466,8 @@ Json JsonReader::ParseString() {
}
Json JsonReader::ParseNull() {
char ch = GetNextNonSpaceChar();
std::string buffer{ch};
Char ch = GetNextNonSpaceChar();
std::string buffer{static_cast<char>(ch)};
for (size_t i = 0; i < 3; ++i) {
buffer.push_back(GetNextChar());
}
@@ -480,7 +480,7 @@ Json JsonReader::ParseNull() {
Json JsonReader::ParseArray() {
std::vector<Json> data;
char ch { GetConsecutiveChar('[') }; // NOLINT
Char ch { GetConsecutiveChar('[') }; // NOLINT
while (true) {
if (PeekNextChar() == ']') {
GetConsecutiveChar(']');
@@ -503,7 +503,7 @@ Json JsonReader::ParseObject() {
Object::Map data;
SkipSpaces();
char ch = PeekNextChar();
auto ch = PeekNextChar();
if (ch == '}') {
GetConsecutiveChar('}');
@@ -652,7 +652,7 @@ Json JsonReader::ParseNumber() {
Json JsonReader::ParseBoolean() {
bool result = false;
char ch = GetNextNonSpaceChar();
Char ch = GetNextNonSpaceChar();
std::string const t_value = u8"true";
std::string const f_value = u8"false";
@@ -737,7 +737,8 @@ Json UBJReader::ParseArray() {
case 'L':
return ParseTypedArray<I64Array>(n);
default:
LOG(FATAL) << "`" + std::string{type} + "` is not supported for typed array."; // NOLINT
LOG(FATAL) << "`" + std::string{static_cast<char>(type)} + // NOLINT
"` is not supported for typed array.";
}
}
std::vector<Json> results;
@@ -794,7 +795,7 @@ Json UBJReader::Load() {
Json UBJReader::Parse() {
while (true) {
char c = PeekNextChar();
auto c = PeekNextChar();
if (c == -1) {
break;
}

View File

@@ -1,13 +1,15 @@
/*!
* Copyright 2022, XGBoost contributors.
/**
* Copyright 2022-2023 by XGBoost contributors.
*/
#ifndef XGBOOST_COMMON_NUMERIC_H_
#define XGBOOST_COMMON_NUMERIC_H_
#include <dmlc/common.h> // OMPException
#include <algorithm> // std::max
#include <iterator> // std::iterator_traits
#include <algorithm> // for std::max
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <iterator> // for iterator_traits
#include <vector>
#include "common.h" // AssertGPUSupport
@@ -15,8 +17,7 @@
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector
namespace xgboost {
namespace common {
namespace xgboost::common {
/**
* \brief Run length encode on CPU, input must be sorted.
@@ -111,11 +112,11 @@ inline double Reduce(Context const*, HostDeviceVector<float> const&) {
namespace cpu_impl {
template <typename It, typename V = typename It::value_type>
V Reduce(Context const* ctx, It first, It second, V const& init) {
size_t n = std::distance(first, second);
common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(ctx->Threads(), init);
common::ParallelFor(n, ctx->Threads(),
[&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + ctx->Threads(), init);
std::size_t n = std::distance(first, second);
auto n_threads = static_cast<std::size_t>(std::min(n, static_cast<std::size_t>(ctx->Threads())));
common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(n_threads, init);
common::ParallelFor(n, n_threads, [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + n_threads, init);
return result;
}
} // namespace cpu_impl
@@ -144,7 +145,6 @@ void Iota(Context const* ctx, It first, It last,
});
}
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // XGBOOST_COMMON_NUMERIC_H_

View File

@@ -1,391 +1,386 @@
/*!
* Copyright 2021-2022 by Contributors
* \file partition_builder.h
* \brief Quick Utility to compute subset of rows
* \author Philip Cho, Tianqi Chen
*/
#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_
#define XGBOOST_COMMON_PARTITION_BUILDER_H_
#include <xgboost/data.h>
#include <algorithm>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "../tree/hist/expand_entry.h"
#include "categorical.h"
#include "column_matrix.h"
#include "xgboost/context.h"
#include "xgboost/tree_model.h"
namespace xgboost {
namespace common {
// The builder partitions samples into the left and right children of a set of nodes.
// It is responsible for:
// 1) Efficient allocation of memory for intermediate results during multi-threaded work
// 2) Merging the partial results produced by threads into the original row set
//    (row_set_collection_)
// BlockSize is a template parameter so that memory alignment can easily be enabled with the
// C++11 'alignas()' feature
template<size_t BlockSize>
class PartitionBuilder {
using BitVector = RBitField8;
public:
/**
 * \brief Prepare the builder for a new set of nodes.
 *
 * \param n_tasks   Total number of tasks; the per-task block storage is only ever grown.
 * \param n_nodes   Number of nodes in the set being partitioned.
 * \param funcNTask Callable returning the number of tasks of a node, given the node's index
 *                  in the set.
 */
template <typename Func>
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
  left_right_nodes_sizes_.resize(n_nodes);
  blocks_offsets_.resize(n_nodes + 1);
  // Exclusive prefix sum over the per-node task counts.
  blocks_offsets_[0] = 0;
  for (size_t node = 0; node < n_nodes; ++node) {
    blocks_offsets_[node + 1] = blocks_offsets_[node] + funcNTask(node);
  }
  if (max_n_tasks_ < n_tasks) {
    mem_blocks_.resize(n_tasks);
    max_n_tasks_ = n_tasks;
  }
}
// split row indexes (rid_span) to 2 parts (left_part, right_part) depending
// on comparison of indexes values (idx_span) and split point (split_cond)
// Handle dense columns
// Analog of std::stable_partition, but in no-inplace manner
template <bool default_left, bool any_missing, typename ColumnType, typename Predicate>
inline std::pair<size_t, size_t> PartitionKernel(ColumnType* p_column,
common::Span<const size_t> row_indices,
common::Span<size_t> left_part,
common::Span<size_t> right_part,
size_t base_rowid, Predicate&& pred) {
auto& column = *p_column;
size_t* p_left_part = left_part.data();
size_t* p_right_part = right_part.data();
size_t nleft_elems = 0;
size_t nright_elems = 0;
auto p_row_indices = row_indices.data();
auto n_samples = row_indices.size();
for (size_t i = 0; i < n_samples; ++i) {
auto rid = p_row_indices[i];
const int32_t bin_id = column[rid - base_rowid];
if (any_missing && bin_id == ColumnType::kMissingId) {
if (default_left) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
} else {
if (pred(rid, bin_id)) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
}
}
return {nleft_elems, nright_elems};
}
/**
 * \brief Split row indices into a left and a right part using a per-row predicate.
 *
 * \param ridx       Row ids to partition.
 * \param left_part  Output buffer for rows for which pred(row id) is true.
 * \param right_part Output buffer for the remaining rows.
 * \param pred       Predicate invoked once per row id, in order.
 * \return {number of rows written left, number of rows written right}.
 */
template <typename Pred>
inline std::pair<size_t, size_t> PartitionRangeKernel(common::Span<const size_t> ridx,
                                                      common::Span<size_t> left_part,
                                                      common::Span<size_t> right_part,
                                                      Pred pred) {
  size_t* const left_begin = left_part.data();
  size_t* const right_begin = right_part.data();
  size_t* left_cursor = left_begin;
  size_t* right_cursor = right_begin;
  for (auto row_id : ridx) {
    if (pred(row_id)) {
      *left_cursor++ = row_id;
    } else {
      *right_cursor++ = row_id;
    }
  }
  // The distance travelled by each cursor is the number of rows written to that side.
  return {static_cast<size_t>(left_cursor - left_begin),
          static_cast<size_t>(right_cursor - right_begin)};
}
/**
 * \brief Partition one block of a node's rows into the task's left and right buffers.
 *
 * \tparam BinIdxType  Bin index type used to access the column matrix.
 * \tparam any_missing Forwarded to the dense column accessor and to PartitionKernel.
 * \tparam any_cat     When false, the categorical branch of the column-based predicate is
 *                     never taken.
 * \param node_in_set   Index into `nodes` of the node being split.
 * \param nodes         Expand entries; `nid` and `split.split_value` are read.
 * \param range         Range of positions in `rid` handled by this call.
 * \param split_cond    Bin threshold: a row goes left when its bin id <= split_cond.
 * \param gmat          Gradient index matrix (cut values, base_rowid, GetGindex).
 * \param column_matrix Column-wise bins; when it is not initialized the gmat-based
 *                      predicate is used instead.
 * \param tree          Tree providing split feature, default direction and categories.
 * \param rid           Row index array that `range` indexes into.
 */
template <typename BinIdxType, bool any_missing, bool any_cat>
void Partition(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
const common::Range1d range,
const bst_bin_t split_cond, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
const RegTree& tree, const size_t* rid) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
std::size_t nid = nodes[node_in_set].nid;
bst_feature_t fid = tree[nid].SplitIndex();
bool default_left = tree[nid].DefaultLeft();
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& cut_values = gmat.cut.Values();
// Predicate for the column-matrix path: receives the row id and the bin id read from the column.
auto pred_hist = [&](auto ridx, auto bin_id) {
if (any_cat && is_cat) {
// Categorical split: look the bin up through gmat; keep the default direction when
// the lookup does not yield an index > -1.
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
go_left = Decision(node_cats, cut_values[gidx]);
}
return go_left;
} else {
return bin_id <= split_cond;
}
};
// Predicate for the path without a column matrix: compares cut values instead of bin ids.
auto pred_approx = [&](auto ridx) {
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
if (is_cat) {
go_left = Decision(node_cats, cut_values[gidx]);
} else {
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
}
}
return go_left;
};
std::pair<size_t, size_t> child_nodes_sizes;
if (!column_matrix.IsInitialized()) {
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
} else {
// default_left is a runtime value here but a template argument of PartitionKernel,
// hence the explicit true/false dispatch below.
if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
if (default_left) {
child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
} else {
child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
}
} else {
// The non-dense column path is only valid when missing values are being handled.
CHECK_EQ(any_missing, true);
auto column =
column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
if (default_left) {
child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
} else {
child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
}
}
}
// Record how many rows this task produced on each side.
const size_t n_left = child_nodes_sizes.first;
const size_t n_right = child_nodes_sizes.second;
SetNLeftElems(node_in_set, range.begin(), n_left);
SetNRightElems(node_in_set, range.begin(), n_right);
}
/**
 * @brief When data is split by column, we don't have all the features locally on the current
 * worker, so we go through all the rows and mark the bit vectors: the decision bit of a row is
 * set when the row goes left, and the missing bit is set when the feature value used for the
 * split is missing.
 *
 * Bits are addressed by (row id - gmat.base_rowid). Only supported when the column matrix is
 * not initialized.
 */
void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
              const common::Range1d range, GHistIndexMatrix const& gmat,
              const common::ColumnMatrix& column_matrix,
              const RegTree& tree, const size_t* rid,
              BitVector* decision_bits, BitVector* missing_bits) {
  common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
  std::size_t nid = nodes[node_in_set].nid;
  bst_feature_t fid = tree[nid].SplitIndex();
  bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
  auto node_cats = tree.NodeCats(nid);
  auto const& cut_values = gmat.cut.Values();

  if (column_matrix.IsInitialized()) {
    LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
  } else {
    for (auto row_id : rid_span) {
      auto gidx = gmat.GetGindex(row_id, fid);
      if (!(gidx > -1)) {
        // No bin for this row and feature: the value is missing.
        missing_bits->Set(row_id - gmat.base_rowid);
        continue;
      }
      bool go_left = false;
      if (is_cat) {
        go_left = Decision(node_cats, cut_values[gidx]);
      } else {
        go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
      }
      if (go_left) {
        decision_bits->Set(row_id - gmat.base_rowid);
      }
    }
  }
}
/**
 * @brief Once we've aggregated the decision and missing bits from all the workers, we can then
 * use them to partition the rows accordingly.
 *
 * A row whose missing bit is set follows the node's default direction; otherwise it goes left
 * exactly when its decision bit is set. Bits are addressed by (row id - gmat.base_rowid).
 */
void PartitionByMask(const size_t node_in_set,
                     std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
                     const common::Range1d range, GHistIndexMatrix const& gmat,
                     const common::ColumnMatrix& column_matrix, const RegTree& tree,
                     const size_t* rid, BitVector const& decision_bits,
                     BitVector const& missing_bits) {
  common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
  common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
  common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
  std::size_t nid = nodes[node_in_set].nid;
  bool default_left = tree[nid].DefaultLeft();

  auto pred_approx = [&](auto ridx) -> bool {
    auto const bit = ridx - gmat.base_rowid;
    if (missing_bits.Check(bit)) {
      return default_left;
    }
    return decision_bits.Check(bit);
  };

  std::pair<size_t, size_t> child_nodes_sizes;
  if (column_matrix.IsInitialized()) {
    LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
  } else {
    child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
  }

  SetNLeftElems(node_in_set, range.begin(), child_nodes_sizes.first);
  SetNRightElems(node_in_set, range.begin(), child_nodes_sizes.second);
}
/**
 * \brief Allocate thread local memory; should be called for each specific task.
 *
 * Does nothing when the block for task `id` has already been allocated.
 *
 * \param id Index of the task in mem_blocks_.
 */
void AllocateForTask(size_t id) {
  if (!mem_blocks_[id]) {
    // A plain new-expression signals failure by throwing, so it never yields a null pointer
    // and no null check is needed. `new BlockInfo` default-initializes the block, i.e. its
    // buffers are not filled here.
    mem_blocks_[id].reset(new BlockInfo);
  }
}
/**
 * \brief View of (end - begin) elements over the left buffer of the task that owns position
 *        `begin` of node `nid`.
 */
common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
  auto& block = mem_blocks_.at(GetTaskIdx(nid, begin));
  const size_t n_rows = end - begin;
  return {block->Left(), n_rows};
}
/**
 * \brief View of (end - begin) elements over the right buffer of the task that owns position
 *        `begin` of node `nid`.
 */
common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
  auto& block = mem_blocks_.at(GetTaskIdx(nid, begin));
  const size_t n_rows = end - begin;
  return {block->Right(), n_rows};
}
/** \brief Store the number of rows the task owning (`nid`, `begin`) wrote to its left buffer. */
void SetNLeftElems(int nid, size_t begin, size_t n_left) {
  auto& block = mem_blocks_.at(GetTaskIdx(nid, begin));
  block->n_left = n_left;
}
/** \brief Store the number of rows the task owning (`nid`, `begin`) wrote to its right buffer. */
void SetNRightElems(int nid, size_t begin, size_t n_right) {
  auto& block = mem_blocks_.at(GetTaskIdx(nid, begin));
  block->n_right = n_right;
}
/** \brief Total number of left rows of node `nid`, as stored by CalculateRowOffsets(). */
size_t GetNLeftElems(int nid) const {
  auto const& sizes = left_right_nodes_sizes_[nid];
  return sizes.first;
}
/** \brief Total number of right rows of node `nid`, as stored by CalculateRowOffsets(). */
size_t GetNRightElems(int nid) const {
  auto const& sizes = left_right_nodes_sizes_[nid];
  return sizes.second;
}
/**
 * \brief Decide where each task's partial result is merged into the final row set.
 *
 * Each thread has partial results for some set of tree nodes. For every node, the left parts
 * of its tasks are laid out first, in task order, followed by the right parts. The per-node
 * totals are stored in left_right_nodes_sizes_.
 */
void CalculateRowOffsets() {
  // `i + 1 < size()` instead of `i < size() - 1`: the latter wraps around to a huge bound
  // when blocks_offsets_ is still empty.
  for (size_t i = 0; i + 1 < blocks_offsets_.size(); ++i) {
    size_t n_left = 0;
    for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
      mem_blocks_[j]->n_offset_left = n_left;
      n_left += mem_blocks_[j]->n_left;
    }
    // Right offsets start after all left rows of the node, so this needs the final n_left.
    size_t n_right = 0;
    for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
      mem_blocks_[j]->n_offset_right = n_left + n_right;
      n_right += mem_blocks_[j]->n_right;
    }
    left_right_nodes_sizes_[i] = {n_left, n_right};
  }
}
void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
size_t task_idx = GetTaskIdx(nid, begin);
size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left;
size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right;
const size_t* left = mem_blocks_[task_idx]->Left();
const size_t* right = mem_blocks_[task_idx]->Right();
std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result);
std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result);
}
/** \brief Index of the task that handles position `begin` of node `nid`. */
size_t GetTaskIdx(int nid, size_t begin) {
  const size_t block_in_node = begin / BlockSize;
  return blocks_offsets_[nid] + block_in_node;
}
/**
 * \brief Copy row partitions into global cache for reuse in objective.
 *
 * `p_position` is resized to the number of rows in `row_set`, with new entries set to the
 * maximum bst_node_t. Every row of a node with a non-negative id then receives that node id,
 * or its bitwise complement when `sampledp(row)` is true.
 */
template <typename Sampledp>
void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set,
                   std::vector<bst_node_t>* p_position, Sampledp sampledp) const {
  auto& positions = *p_position;
  positions.resize(row_set.Data()->size(), std::numeric_limits<bst_node_t>::max());
  auto p_rows_begin = row_set.Data()->data();
  ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) {
    auto const& node = row_set[i];
    if (node.node_id < 0) {
      return;
    }
    CHECK(tree[node.node_id].IsLeaf());
    if (!node.begin) {  // guard for empty node.
      return;
    }
    size_t ptr_offset = node.end - p_rows_begin;
    CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
    for (auto it = node.begin; it != node.end; ++it) {
      auto const ridx = *it;
      positions[ridx] = sampledp(ridx) ? ~node.node_id : node.node_id;
    }
  });
}
protected:
/**
 * \brief Per-task scratch storage: two buffers of BlockSize row ids plus the counts and merge
 *        offsets that go with them.
 */
struct BlockInfo {
  size_t n_left;          // rows written to the left buffer
  size_t n_right;         // rows written to the right buffer
  size_t n_offset_left;   // offset used by MergeToArray for the left rows
  size_t n_offset_right;  // offset used by MergeToArray for the right rows

  size_t* Left() { return left_data_; }
  size_t* Right() { return right_data_; }

 private:
  size_t left_data_[BlockSize];
  size_t right_data_[BlockSize];
};
std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
std::vector<size_t> blocks_offsets_;
std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
size_t max_n_tasks_ = 0;
};
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_
/**
* Copyright 2021-2023 by Contributors
* \file partition_builder.h
* \brief Quick Utility to compute subset of rows
* \author Philip Cho, Tianqi Chen
*/
#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_
#define XGBOOST_COMMON_PARTITION_BUILDER_H_
#include <xgboost/data.h>
#include <algorithm>
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "../tree/hist/expand_entry.h"
#include "categorical.h"
#include "column_matrix.h"
#include "xgboost/context.h"
#include "xgboost/tree_model.h"
namespace xgboost::common {
// The builder partitions samples into the left and right children of a set of nodes.
// It is responsible for:
// 1) Efficient allocation of memory for intermediate results during multi-threaded work
// 2) Merging the partial results produced by threads into the original row set
//    (row_set_collection_)
// BlockSize is a template parameter so that memory alignment can easily be enabled with the
// C++11 'alignas()' feature
template<size_t BlockSize>
class PartitionBuilder {
using BitVector = RBitField8;
public:
/**
 * \brief Prepare the builder for a new set of nodes.
 *
 * \param n_tasks   Total number of tasks; the per-task block storage is only ever grown.
 * \param n_nodes   Number of nodes in the set being partitioned.
 * \param funcNTask Callable returning the number of tasks of a node, given the node's index
 *                  in the set.
 */
template <typename Func>
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
  left_right_nodes_sizes_.resize(n_nodes);
  blocks_offsets_.resize(n_nodes + 1);
  // Exclusive prefix sum over the per-node task counts.
  blocks_offsets_[0] = 0;
  for (size_t node = 0; node < n_nodes; ++node) {
    blocks_offsets_[node + 1] = blocks_offsets_[node] + funcNTask(node);
  }
  if (max_n_tasks_ < n_tasks) {
    mem_blocks_.resize(n_tasks);
    max_n_tasks_ = n_tasks;
  }
}
// split row indexes (rid_span) to 2 parts (left_part, right_part) depending
// on comparison of indexes values (idx_span) and split point (split_cond)
// Handle dense columns
// Analog of std::stable_partition, but in no-inplace manner
template <bool default_left, bool any_missing, typename ColumnType, typename Predicate>
inline std::pair<size_t, size_t> PartitionKernel(ColumnType* p_column,
common::Span<const size_t> row_indices,
common::Span<size_t> left_part,
common::Span<size_t> right_part,
size_t base_rowid, Predicate&& pred) {
auto& column = *p_column;
size_t* p_left_part = left_part.data();
size_t* p_right_part = right_part.data();
size_t nleft_elems = 0;
size_t nright_elems = 0;
auto p_row_indices = row_indices.data();
auto n_samples = row_indices.size();
for (size_t i = 0; i < n_samples; ++i) {
auto rid = p_row_indices[i];
const int32_t bin_id = column[rid - base_rowid];
if (any_missing && bin_id == ColumnType::kMissingId) {
if (default_left) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
} else {
if (pred(rid, bin_id)) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
}
}
return {nleft_elems, nright_elems};
}
/**
 * \brief Split row indices into a left and a right part using a per-row predicate.
 *
 * \param ridx       Row ids to partition.
 * \param left_part  Output buffer for rows for which pred(row id) is true.
 * \param right_part Output buffer for the remaining rows.
 * \param pred       Predicate invoked once per row id, in order.
 * \return {number of rows written left, number of rows written right}.
 */
template <typename Pred>
inline std::pair<size_t, size_t> PartitionRangeKernel(common::Span<const size_t> ridx,
                                                      common::Span<size_t> left_part,
                                                      common::Span<size_t> right_part,
                                                      Pred pred) {
  size_t* const left_begin = left_part.data();
  size_t* const right_begin = right_part.data();
  size_t* left_cursor = left_begin;
  size_t* right_cursor = right_begin;
  for (auto row_id : ridx) {
    if (pred(row_id)) {
      *left_cursor++ = row_id;
    } else {
      *right_cursor++ = row_id;
    }
  }
  // The distance travelled by each cursor is the number of rows written to that side.
  return {static_cast<size_t>(left_cursor - left_begin),
          static_cast<size_t>(right_cursor - right_begin)};
}
/**
 * \brief Partition one block of a node's rows into the task's left and right buffers.
 *
 * \tparam BinIdxType  Bin index type used to access the column matrix.
 * \tparam any_missing Forwarded to the dense column accessor and to PartitionKernel.
 * \tparam any_cat     When false, the categorical branch of the column-based predicate is
 *                     never taken.
 * \tparam ExpandEntry Entry type; must expose `nid` and `split.split_value`.
 * \param node_in_set   Index into `nodes` of the node being split.
 * \param nodes         Expand entries of the node set.
 * \param range         Range of positions in `rid` handled by this call.
 * \param split_cond    Bin threshold: a row goes left when its bin id <= split_cond.
 * \param gmat          Gradient index matrix (cut values, base_rowid, GetGindex).
 * \param column_matrix Column-wise bins; when it is not initialized the gmat-based
 *                      predicate is used instead.
 * \param tree          Tree providing split feature, default direction and categories.
 * \param rid           Row index array that `range` indexes into.
 */
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void Partition(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, const bst_bin_t split_cond,
GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix,
const RegTree& tree, const size_t* rid) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
std::size_t nid = nodes[node_in_set].nid;
bst_feature_t fid = tree.SplitIndex(nid);
bool default_left = tree.DefaultLeft(nid);
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& cut_values = gmat.cut.Values();
// Predicate for the column-matrix path: receives the row id and the bin id read from the column.
auto pred_hist = [&](auto ridx, auto bin_id) {
if (any_cat && is_cat) {
// Categorical split: look the bin up through gmat; keep the default direction when
// the lookup does not yield an index > -1.
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
go_left = Decision(node_cats, cut_values[gidx]);
}
return go_left;
} else {
return bin_id <= split_cond;
}
};
// Predicate for the path without a column matrix: compares cut values instead of bin ids.
auto pred_approx = [&](auto ridx) {
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
if (is_cat) {
go_left = Decision(node_cats, cut_values[gidx]);
} else {
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
}
}
return go_left;
};
std::pair<size_t, size_t> child_nodes_sizes;
if (!column_matrix.IsInitialized()) {
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
} else {
// default_left is a runtime value here but a template argument of PartitionKernel,
// hence the explicit true/false dispatch below.
if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
if (default_left) {
child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
} else {
child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
}
} else {
// The non-dense column path is only valid when missing values are being handled.
CHECK_EQ(any_missing, true);
auto column =
column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
if (default_left) {
child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
} else {
child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
}
}
}
// Record how many rows this task produced on each side.
const size_t n_left = child_nodes_sizes.first;
const size_t n_right = child_nodes_sizes.second;
SetNLeftElems(node_in_set, range.begin(), n_left);
SetNRightElems(node_in_set, range.begin(), n_right);
}
/**
 * @brief When data is split by column, we don't have all the features locally on the current
 * worker, so we go through all the rows and mark the bit vectors: the decision bit of a row is
 * set when the row goes left, and the missing bit is set when the feature value used for the
 * split is missing.
 *
 * Bits are addressed by (row id - gmat.base_rowid). Only supported when the column matrix is
 * not initialized.
 *
 * @param node_in_set   Index into `nodes` of the node being split.
 * @param nodes         Expand entries; `nid` and `split.split_value` are read.
 * @param range         Range of positions in `rid` handled by this call.
 * @param gmat          Gradient index matrix (cut values, base_rowid, GetGindex).
 * @param column_matrix Must not be initialized, otherwise a fatal error is logged.
 * @param tree          Tree providing the split feature, split type and categories.
 * @param rid           Row index array that `range` indexes into.
 * @param decision_bits Output: bit set for each row that goes left.
 * @param missing_bits  Output: bit set for each row whose split feature is missing.
 */
template <typename ExpandEntry>
void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
              const common::Range1d range, GHistIndexMatrix const& gmat,
              const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
              BitVector* decision_bits, BitVector* missing_bits) {
  common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
  std::size_t nid = nodes[node_in_set].nid;
  // Query the node through RegTree itself, as Partition() does, rather than via tree[nid].
  bst_feature_t fid = tree.SplitIndex(nid);
  bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
  auto node_cats = tree.NodeCats(nid);
  auto const& cut_values = gmat.cut.Values();

  if (!column_matrix.IsInitialized()) {
    for (auto row_id : rid_span) {
      auto gidx = gmat.GetGindex(row_id, fid);
      if (gidx > -1) {
        bool go_left = false;
        if (is_cat) {
          go_left = Decision(node_cats, cut_values[gidx]);
        } else {
          go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
        }
        if (go_left) {
          decision_bits->Set(row_id - gmat.base_rowid);
        }
      } else {
        // No bin for this row and feature: the value is missing.
        missing_bits->Set(row_id - gmat.base_rowid);
      }
    }
  } else {
    LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
  }
}
/**
 * @brief Once we've aggregated the decision and missing bits from all the workers, we can then
 * use them to partition the rows accordingly.
 *
 * A row whose missing bit is set follows the node's default direction; otherwise it goes left
 * exactly when its decision bit is set. Bits are addressed by (row id - gmat.base_rowid).
 *
 * @param node_in_set   Index into `nodes` of the node being split.
 * @param nodes         Expand entries; only `nid` is read.
 * @param range         Range of positions in `rid` handled by this call.
 * @param gmat          Gradient index matrix; only `base_rowid` is read.
 * @param column_matrix Must not be initialized, otherwise a fatal error is logged.
 * @param tree          Tree providing the default direction of the node.
 * @param rid           Row index array that `range` indexes into.
 * @param decision_bits Aggregated go-left bits.
 * @param missing_bits  Aggregated missing-value bits.
 */
template <typename ExpandEntry>
void PartitionByMask(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
                     const common::Range1d range, GHistIndexMatrix const& gmat,
                     const common::ColumnMatrix& column_matrix, const RegTree& tree,
                     const size_t* rid, BitVector const& decision_bits,
                     BitVector const& missing_bits) {
  common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
  common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
  common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
  std::size_t nid = nodes[node_in_set].nid;
  // Query the node through RegTree itself, as Partition() does, rather than via tree[nid].
  bool default_left = tree.DefaultLeft(nid);

  auto pred_approx = [&](auto ridx) {
    bool go_left = default_left;
    bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
    if (!is_missing) {
      go_left = decision_bits.Check(ridx - gmat.base_rowid);
    }
    return go_left;
  };

  std::pair<size_t, size_t> child_nodes_sizes;
  if (!column_matrix.IsInitialized()) {
    child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
  } else {
    LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
  }

  // Record how many rows this task produced on each side.
  const size_t n_left = child_nodes_sizes.first;
  const size_t n_right = child_nodes_sizes.second;
  SetNLeftElems(node_in_set, range.begin(), n_left);
  SetNRightElems(node_in_set, range.begin(), n_right);
}
/**
 * \brief Allocate thread local memory; should be called for each specific task.
 *
 * Does nothing when the block for task `id` has already been allocated.
 *
 * \param id Index of the task in mem_blocks_.
 */
void AllocateForTask(size_t id) {
  if (!mem_blocks_[id]) {
    // A plain new-expression signals failure by throwing, so it never yields a null pointer
    // and no null check is needed. `new BlockInfo` default-initializes the block, i.e. its
    // buffers are not filled here.
    mem_blocks_[id].reset(new BlockInfo);
  }
}
/**
 * \brief View of (end - begin) elements over the left buffer of the task that owns position
 *        `begin` of node `nid`.
 */
common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
  auto& block = mem_blocks_.at(GetTaskIdx(nid, begin));
  const size_t n_rows = end - begin;
  return {block->Left(), n_rows};
}
/**
 * \brief View of (end - begin) elements over the right buffer of the task that owns position
 *        `begin` of node `nid`.
 */
common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
  auto& block = mem_blocks_.at(GetTaskIdx(nid, begin));
  const size_t n_rows = end - begin;
  return {block->Right(), n_rows};
}
/** \brief Store the number of rows the task owning (`nid`, `begin`) wrote to its left buffer. */
void SetNLeftElems(int nid, size_t begin, size_t n_left) {
  auto& block = mem_blocks_.at(GetTaskIdx(nid, begin));
  block->n_left = n_left;
}
/** \brief Store the number of rows the task owning (`nid`, `begin`) wrote to its right buffer. */
void SetNRightElems(int nid, size_t begin, size_t n_right) {
  auto& block = mem_blocks_.at(GetTaskIdx(nid, begin));
  block->n_right = n_right;
}
/** \brief Total number of left rows of node `nid`, as stored by CalculateRowOffsets(). */
[[nodiscard]] std::size_t GetNLeftElems(int nid) const {
  auto const& sizes = left_right_nodes_sizes_[nid];
  return sizes.first;
}
/** \brief Total number of right rows of node `nid`, as stored by CalculateRowOffsets(). */
[[nodiscard]] std::size_t GetNRightElems(int nid) const {
  auto const& sizes = left_right_nodes_sizes_[nid];
  return sizes.second;
}
/**
 * \brief Decide where each task's partial result is merged into the final row set.
 *
 * Each thread has partial results for some set of tree nodes. For every node, the left parts
 * of its tasks are laid out first, in task order, followed by the right parts. The per-node
 * totals are stored in left_right_nodes_sizes_.
 */
void CalculateRowOffsets() {
  // `i + 1 < size()` instead of `i < size() - 1`: the latter wraps around to a huge bound
  // when blocks_offsets_ is still empty.
  for (size_t i = 0; i + 1 < blocks_offsets_.size(); ++i) {
    size_t n_left = 0;
    for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
      mem_blocks_[j]->n_offset_left = n_left;
      n_left += mem_blocks_[j]->n_left;
    }
    // Right offsets start after all left rows of the node, so this needs the final n_left.
    size_t n_right = 0;
    for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
      mem_blocks_[j]->n_offset_right = n_left + n_right;
      n_right += mem_blocks_[j]->n_right;
    }
    left_right_nodes_sizes_[i] = {n_left, n_right};
  }
}
// Copy the left and right buffers of one task into `rows_indexes`, at the offsets stored
// in the block (`n_offset_left` / `n_offset_right`).
void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
  auto& block = *mem_blocks_[GetTaskIdx(nid, begin)];
  size_t* dst_left = rows_indexes + block.n_offset_left;
  size_t* dst_right = rows_indexes + block.n_offset_right;
  std::copy_n(block.Left(), block.n_left, dst_left);
  std::copy_n(block.Right(), block.n_right, dst_right);
}
// Index into `mem_blocks_`: first block of node `nid` plus the number of whole blocks of
// `BlockSize` rows that precede `begin`.
size_t GetTaskIdx(int nid, size_t begin) {
  size_t const block_in_node = begin / BlockSize;
  return blocks_offsets_[nid] + block_in_node;
}
// Copy row partitions into global cache for reuse in objective.
//
// `p_position` is resized to the number of rows and pre-filled with the maximum node id.
// Every row found in a node of `row_set` receives that node's id; when `sampledp(row)`
// is true the bitwise complement of the id is stored instead.
template <typename Sampledp>
void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set,
                   std::vector<bst_node_t>* p_position, Sampledp sampledp) const {
  auto& positions = *p_position;
  positions.resize(row_set.Data()->size(), std::numeric_limits<bst_node_t>::max());
  auto first_row = row_set.Data()->data();
  ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) {
    auto const& elem = row_set[i];
    if (elem.node_id < 0) {
      return;
    }
    CHECK(tree.IsLeaf(elem.node_id));
    if (!elem.begin) {  // guard for empty node.
      return;
    }
    size_t ptr_offset = elem.end - first_row;
    CHECK_LE(ptr_offset, row_set.Data()->size()) << elem.node_id;
    for (auto it = elem.begin; it != elem.end; ++it) {
      auto const ridx = *it;
      positions[ridx] = sampledp(ridx) ? ~elem.node_id : elem.node_id;
    }
  });
}
protected:
// Scratch storage of a single task: two fixed-size buffers of `BlockSize` entries plus
// the bookkeeping needed to merge them into the final row index array.
struct BlockInfo {
  // Number of used entries in the left / right buffer.
  size_t n_left;
  size_t n_right;
  // Offsets inside the merged row index array where the left / right buffer is copied to.
  size_t n_offset_left;
  size_t n_offset_right;

  // Pointer to the first element of the left buffer.
  size_t* Left() { return left_data_; }
  // Pointer to the first element of the right buffer.
  size_t* Right() { return right_data_; }

 private:
  size_t left_data_[BlockSize];
  size_t right_data_[BlockSize];
};
std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
std::vector<size_t> blocks_offsets_;
std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
size_t max_n_tasks_ = 0;
};
} // namespace xgboost::common
#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_

View File

@@ -359,6 +359,7 @@ void AddCutPoint(typename SketchType::SummaryContainer const &summary, int max_b
HistogramCuts *cuts) {
size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
auto &cut_values = cuts->cut_values_.HostVector();
// we use the min_value as the first (0th) element, hence starting from 1.
for (size_t i = 1; i < required_cuts; ++i) {
bst_float cpt = summary.data[i].value;
if (i == 1 || cpt > cut_values.back()) {
@@ -419,8 +420,8 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
} else {
AddCutPoint<WQSketch>(a, max_num_bins, cuts);
// push a value that is greater than anything
const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value
: cuts->min_vals_.HostVector()[fid];
const bst_float cpt =
(a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
// this must be bigger than last value in a scale
const bst_float last = cpt + (fabs(cpt) + 1e-5f);
cuts->cut_values_.HostVector().push_back(last);

View File

@@ -352,19 +352,6 @@ struct WQSummary {
prev_rmax = data[i].rmax;
}
}
// check consistency of the summary
inline bool Check(const char *msg) const {
const float tol = 10.0f;
for (size_t i = 0; i < this->size; ++i) {
if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
LOG(INFO) << "---------- WQSummary::Check did not pass ----------";
this->Print();
return false;
}
}
return true;
}
};
/*! \brief try to do efficient pruning */

View File

@@ -6,9 +6,7 @@
#include <algorithm> // for copy_n, max, min, none_of, all_of
#include <cstddef> // for size_t
#include <cstdio> // for sscanf
#include <exception> // for exception
#include <functional> // for greater
#include <iterator> // for reverse_iterator
#include <string> // for char_traits, string
#include "algorithm.h" // for ArgSort
@@ -18,12 +16,113 @@
#include "xgboost/base.h" // for bst_group_t
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/linalg.h" // for All, TensorView, Range, Tensor, Vector
#include "xgboost/logging.h" // for Error, LogCheck_EQ, CHECK_EQ
#include "xgboost/linalg.h" // for All, TensorView, Range
#include "xgboost/logging.h" // for CHECK_EQ
namespace xgboost::ltr {
/**
 * \brief Initialise the CPU part of the cache: the group pointer, the size of the
 *        largest group and the weight normalisation factor.
 *
 * \param ctx  Context, forwarded to the weight accessor.
 * \param info Meta info providing the group pointer, the number of rows and the weights.
 */
void RankingCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
  if (info.group_ptr_.empty()) {
    // No group information supplied: use a single group [0, num_row_).
    group_ptr_.Resize(2, 0);
    group_ptr_.HostVector()[1] = info.num_row_;
  } else {
    group_ptr_.HostVector() = info.group_ptr_;
  }

  auto const& gptr = group_ptr_.ConstHostVector();
  for (std::size_t i = 1; i < gptr.size(); ++i) {
    std::size_t n = gptr[i] - gptr[i - 1];
    max_group_size_ = std::max(max_group_size_, n);
  }

  double sum_weights = 0;
  auto n_groups = Groups();
  auto weight = common::MakeOptionalWeights(ctx, info.weights_);
  // The counter has the same type as the bound returned by Groups(); a narrower counter
  // would wrap around before reaching a bound that does not fit in it.
  for (std::size_t k = 0; k < n_groups; ++k) {
    sum_weights += weight[k];
  }
  weight_norm_ = static_cast<double>(n_groups) / sum_weights;
}
/**
 * \brief Arg-sort the predictions of every group with `std::greater<>` on CPU.
 *
 * The result is written to `sorted_idx_cache_`; the indices of each group are produced
 * by an arg-sort over that group's own slice of `predt`.
 *
 * \param ctx   Context providing the number of threads.
 * \param predt Predictions, must have the same size as the cached index buffer.
 *
 * \return Host span over the cached sorted index.
 */
common::Span<std::size_t const> RankingCache::MakeRankOnCPU(Context const* ctx,
                                                            common::Span<float const> predt) {
  auto group_ptr = this->DataGroupPtr(ctx);
  auto rank = this->sorted_idx_cache_.HostSpan();
  CHECK_EQ(rank.size(), predt.size());
  common::ParallelFor(this->Groups(), ctx->Threads(), [&](auto g) {
    auto n_in_group = group_ptr[g + 1] - group_ptr[g];
    auto group_predt = predt.subspan(group_ptr[g], n_in_group);
    auto group_rank = rank.subspan(group_ptr[g], n_in_group);
    auto predt_begin = group_predt.data();
    auto predt_end = group_predt.data() + group_predt.size();
    auto sorted_idx =
        common::ArgSort<std::size_t>(ctx, predt_begin, predt_end, std::greater<>{});
    CHECK_EQ(group_rank.size(), sorted_idx.size());
    std::copy_n(sorted_idx.data(), sorted_idx.size(), group_rank.data());
  });
  return rank;
}
#if !defined(XGBOOST_USE_CUDA)
// Stand-ins used when XGBOOST_USE_CUDA is not defined: both delegate to
// common::AssertGPUSupport().
void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) {
  common::AssertGPUSupport();
}

common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*,
                                                             common::Span<float const>) {
  common::AssertGPUSupport();
  return {};
}
#endif  // !defined(XGBOOST_USE_CUDA)
/**
 * \brief Initialise the NDCG cache on CPU.
 *
 * Fills the discount table for every position up to the maximum group size, validates
 * the labels and stores, for every group, the value of CalcInvIDCG applied to the DCG of
 * the labels sorted with `std::greater<>` (truncated at TopK).
 */
void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
  auto const h_group_ptr = this->DataGroupPtr(ctx);

  auto const n_discounts = MaxGroupSize();
  discounts_.Resize(n_discounts, 0);
  auto& h_discounts = discounts_.HostVector();
  for (std::size_t pos = 0; pos < n_discounts; ++pos) {
    h_discounts[pos] = CalcDCGDiscount(pos);
  }

  auto n_groups = h_group_ptr.size() - 1;
  auto h_labels = info.labels.HostView().Slice(linalg::All(), 0);

  CheckNDCGLabels(this->Param(), h_labels,
                  [](auto beg, auto end, auto op) { return std::none_of(beg, end, op); });

  inv_idcg_.Reshape(n_groups);
  auto h_inv_idcg = inv_idcg_.HostView();
  std::size_t topk = this->Param().TopK();
  auto const exp_gain = this->Param().ndcg_exp_gain;

  common::ParallelFor(n_groups, ctx->Threads(), [&](auto g) {
    auto g_labels = h_labels.Slice(linalg::Range(h_group_ptr[g], h_group_ptr[g + 1]));
    auto sorted_idx = common::ArgSort<std::size_t>(ctx, linalg::cbegin(g_labels),
                                                   linalg::cend(g_labels), std::greater<>{});
    // Only the first `topk` documents contribute.
    auto const n_used = std::min(g_labels.Size(), topk);
    double idcg{0.0};
    for (std::size_t i = 0; i < n_used; ++i) {
      auto const label = g_labels(sorted_idx[i]);
      double const gain = exp_gain ? CalcDCGGain(label) : static_cast<double>(label);
      idcg += h_discounts[i] * gain;
    }
    h_inv_idcg(g) = CalcInvIDCG(idcg);
  });
}
#if !defined(XGBOOST_USE_CUDA)
// Stand-in used when XGBOOST_USE_CUDA is not defined: delegates to
// common::AssertGPUSupport().
void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) {
  common::AssertGPUSupport();
}
#endif  // !defined(XGBOOST_USE_CUDA)
DMLC_REGISTER_PARAMETER(LambdaRankParam);
// CPU initialisation of the MAP cache: validates the first label column with
// CheckMapLabels, using std::all_of as the predicate runner.
void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {
  auto const h_label = info.labels.HostView().Slice(linalg::All(), 0);
  auto all_of_host = [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); };
  CheckMapLabels(h_label, all_of_host);
}
#if !defined(XGBOOST_USE_CUDA)
// Stand-in used when XGBOOST_USE_CUDA is not defined: delegates to
// common::AssertGPUSupport().
void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) {
  common::AssertGPUSupport();
}
#endif  // !defined(XGBOOST_USE_CUDA)
std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) {
std::string out_name;
if (!param.empty()) {

212
src/common/ranking_utils.cu Normal file
View File

@@ -0,0 +1,212 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#include <thrust/functional.h> // for maximum
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/logical.h> // for none_of, all_of
#include <thrust/pair.h> // for pair, make_pair
#include <thrust/reduce.h> // for reduce
#include <thrust/scan.h> // for inclusive_scan
#include <cstddef> // for size_t
#include "algorithm.cuh" // for SegmentedArgSort
#include "cuda_context.cuh" // for CUDAContext
#include "device_helpers.cuh" // for MakeTransformIterator, LaunchN
#include "optional_weight.h" // for MakeOptionalWeights, OptionalWeights
#include "ranking_utils.cuh" // for ThreadsForMean
#include "ranking_utils.h"
#include "threading_utils.cuh" // for SegmentedTrapezoidThreads
#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t
#include "xgboost/context.h" // for Context
#include "xgboost/linalg.h" // for VectorView, All, Range
#include "xgboost/logging.h" // for CHECK
#include "xgboost/span.h" // for Span
namespace xgboost::ltr {
namespace cuda_impl {
/**
 * \brief Compute the DCG of every query group on device, truncated at position `k`.
 *
 * \param ctx          Context providing the CUDA stream.
 * \param d_labels     Labels of all samples.
 * \param d_sorted_idx Sorted index; it is sliced per group and its values are used to
 *                     index the labels of the same group.
 * \param exp_gain     Use CalcDCGGain for the gain when true, the raw label otherwise.
 * \param d_group_ptr  Group offsets, size is the number of groups + 1.
 * \param k            Positions at or beyond `k` inside a group contribute 0.
 * \param out_dcg      Output, one value per group. Must be contiguous.
 */
void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
common::Span<bst_group_t const> d_group_ptr, std::size_t k,
linalg::VectorView<double> out_dcg) {
CHECK_EQ(d_group_ptr.size() - 1, out_dcg.Size());
using IdxGroup = thrust::pair<std::size_t, std::size_t>;
// Pairs every global sample index with the result of dh::SegmentId for that index
// (presumably the index of the group containing it -- confirm against device_helpers).
auto group_it = dh::MakeTransformIterator<IdxGroup>(
thrust::make_counting_iterator(0ull), [=] XGBOOST_DEVICE(std::size_t idx) {
return thrust::make_pair(idx, dh::SegmentId(d_group_ptr, idx)); // NOLINT
});
// Per-sample contribution to the DCG of its group: gain * discount, or 0 past `k`.
auto value_it = dh::MakeTransformIterator<double>(
group_it,
[exp_gain, d_labels, d_group_ptr, k,
d_sorted_idx] XGBOOST_DEVICE(IdxGroup const& l) -> double {
auto g_begin = d_group_ptr[l.second];
auto g_size = d_group_ptr[l.second + 1] - g_begin;
// Position of the sample relative to the beginning of its group.
auto idx_in_group = l.first - g_begin;
if (idx_in_group >= k) {
return 0.0;
}
double gain{0.0};
auto g_sorted_idx = d_sorted_idx.subspan(g_begin, g_size);
auto g_labels = d_labels.Slice(linalg::Range(g_begin, g_begin + g_size));
if (exp_gain) {
gain = ltr::CalcDCGGain(g_labels(g_sorted_idx[idx_in_group]));
} else {
gain = g_labels(g_sorted_idx[idx_in_group]);
}
double discount = CalcDCGDiscount(idx_in_group);
return gain * discount;
});
CHECK(out_dcg.Contiguous());
std::size_t bytes;
// First call with a null workspace only queries the required temporary storage size.
cub::DeviceSegmentedReduce::Sum(nullptr, bytes, value_it, out_dcg.Values().data(),
d_group_ptr.size() - 1, d_group_ptr.data(),
d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
dh::TemporaryArray<char> temp(bytes);
// Second call performs the per-group summation, using the group offsets as segments.
cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, value_it, out_dcg.Values().data(),
d_group_ptr.size() - 1, d_group_ptr.data(),
d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
}
/**
 * \brief Compute the value of CalcInvIDCG for the ideal DCG of every query group.
 *
 * The labels are arg-sorted per group, the DCG of that ordering is computed with
 * CalcQueriesDCG (truncated at `p.TopK()`), and the result is transformed in place.
 *
 * \param ctx          Context providing the CUDA stream.
 * \param d_labels     Labels of all samples.
 * \param d_group_ptr  Group offsets, must contain at least 2 elements.
 * \param out_inv_IDCG Output, one value per group.
 * \param p            Ranking parameter (`ndcg_exp_gain` and `TopK()` are used).
 */
void CalcQueriesInvIDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
common::Span<bst_group_t const> d_group_ptr,
linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const& p) {
CHECK_GE(d_group_ptr.size(), 2ul);
size_t n_groups = d_group_ptr.size() - 1;
CHECK_EQ(out_inv_IDCG.Size(), n_groups);
dh::device_vector<std::size_t> sorted_idx(d_labels.Size());
auto d_sorted_idx = dh::ToSpan(sorted_idx);
// NOTE(review): the template arguments <false, true> presumably select a descending,
// per-group arg-sort -- confirm against algorithm.cuh.
common::SegmentedArgSort<false, true>(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx);
// The output buffer first receives the DCG of the sorted labels ...
CalcQueriesDCG(ctx, d_labels, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(), out_inv_IDCG);
// ... which is then replaced element-wise by CalcInvIDCG of that value.
dh::LaunchN(out_inv_IDCG.Size(), ctx->CUDACtx()->Stream(),
[out_inv_IDCG] XGBOOST_DEVICE(size_t idx) mutable {
double idcg = out_inv_IDCG(idx);
out_inv_IDCG(idx) = CalcInvIDCG(idcg);
});
}
} // namespace cuda_impl
// File-local functors used by the cache initialisation below.
namespace {
// Runs thrust::none_of with the execution policy of the stored CUDA context; passed to
// CheckNDCGLabels as the `none_of` implementation.
struct CheckNDCGOp {
CUDAContext const* cuctx;
template <typename It, typename Op>
bool operator()(It beg, It end, Op op) {
return thrust::none_of(cuctx->CTP(), beg, end, op);
}
};
// Runs thrust::all_of with the execution policy of the stored CUDA context; passed to
// CheckMapLabels as the `all_of` implementation.
struct CheckMAPOp {
CUDAContext const* cuctx;
template <typename It, typename Op>
bool operator()(It beg, It end, Op op) {
return thrust::all_of(cuctx->CTP(), beg, end, op);
}
};
// Writes the result of ThreadsForMean for group `i` to slot `i + 1` of the output.
struct ThreadGroupOp {
common::Span<bst_group_t const> d_group_ptr;
std::size_t n_pairs;
common::Span<std::size_t> out_thread_group_ptr;
XGBOOST_DEVICE void operator()(std::size_t i) {
out_thread_group_ptr[i + 1] =
cuda_impl::ThreadsForMean(d_group_ptr[i + 1] - d_group_ptr[i], n_pairs);
}
};
// Returns the number of samples in group `i`.
struct GroupSizeOp {
common::Span<bst_group_t const> d_group_ptr;
XGBOOST_DEVICE auto operator()(std::size_t i) -> std::size_t {
return d_group_ptr[i + 1] - d_group_ptr[i];
}
};
// Returns the `i`-th weight as a double.
struct WeightOp {
common::OptionalWeights d_weight;
XGBOOST_DEVICE auto operator()(std::size_t i) -> double { return d_weight[i]; }
};
} // anonymous namespace
/**
 * \brief Initialise the cache on device: group pointer, maximum group size, the thread
 *        assignment for gradient calculation, the sorted index buffer and the weight
 *        normalisation factor.
 *
 * \param ctx  Context providing the device ordinal and the CUDA stream.
 * \param info Meta info providing group pointer, number of rows, labels and weights.
 */
void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
CUDAContext const* cuctx = ctx->CUDACtx();
group_ptr_.SetDevice(ctx->gpu_id);
if (info.group_ptr_.empty()) {
// No group information supplied: use a single group [0, num_row_).
group_ptr_.Resize(2, 0);
group_ptr_.HostVector()[1] = info.num_row_;
} else {
// Copy the user supplied group pointer to the device on the context's stream.
auto const& h_group_ptr = info.group_ptr_;
group_ptr_.Resize(h_group_ptr.size());
auto d_group_ptr = group_ptr_.DeviceSpan();
dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
cudaMemcpyHostToDevice, cuctx->Stream()));
}
auto d_group_ptr = DataGroupPtr(ctx);
std::size_t n_groups = Groups();
// Size of the largest group, obtained by a max-reduction over the group sizes.
auto it = dh::MakeTransformIterator<std::size_t>(thrust::make_counting_iterator(0ul),
GroupSizeOp{d_group_ptr});
max_group_size_ =
thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum<std::size_t>{});
threads_group_ptr_.SetDevice(ctx->gpu_id);
threads_group_ptr_.Resize(n_groups + 1, 0);
auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan();
if (param_.HasTruncation()) {
n_cuda_threads_ =
common::SegmentedTrapezoidThreads(d_group_ptr, d_threads_group_ptr, Param().NumPair());
} else {
// Without truncation: per-group thread counts are written to slots [1, n_groups] and
// turned into offsets by an inclusive scan (slot 0 stays 0 from the Resize above).
auto n_pairs = Param().NumPair();
dh::LaunchN(n_groups, cuctx->Stream(),
ThreadGroupOp{d_group_ptr, n_pairs, d_threads_group_ptr});
thrust::inclusive_scan(cuctx->CTP(), dh::tcbegin(d_threads_group_ptr),
dh::tcend(d_threads_group_ptr), dh::tbegin(d_threads_group_ptr));
n_cuda_threads_ = info.num_row_ * param_.NumPair();
}
sorted_idx_cache_.SetDevice(ctx->gpu_id);
sorted_idx_cache_.Resize(info.labels.Size(), 0);
auto weight = common::MakeOptionalWeights(ctx, info.weights_);
auto w_it =
dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), WeightOp{weight});
// NOTE(review): unlike the other thrust calls in this function, this reduction is not
// given `cuctx->CTP()` as execution policy -- confirm whether that is intended.
weight_norm_ = static_cast<double>(n_groups) / thrust::reduce(w_it, w_it + n_groups);
}
/**
 * \brief Arg-sort the predictions of every group on device.
 *
 * \param ctx   Context used for the sort and for obtaining the group pointer.
 * \param predt Predictions of all samples.
 *
 * \return Device span over the cached sorted index, which is overwritten by this call.
 */
common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const* ctx,
common::Span<float const> predt) {
auto d_sorted_idx = sorted_idx_cache_.DeviceSpan();
auto d_group_ptr = DataGroupPtr(ctx);
// NOTE(review): <false, true> presumably selects a descending, per-group arg-sort --
// confirm against algorithm.cuh.
common::SegmentedArgSort<false, true>(ctx, predt, d_group_ptr, d_sorted_idx);
return d_sorted_idx;
}
/**
 * \brief Initialise the NDCG cache on device: validates the labels, computes the
 *        inverse IDCG of every group and fills the discount table.
 *
 * \param ctx  Context providing the device ordinal and the CUDA stream.
 * \param info Meta info providing the labels.
 */
void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
CUDAContext const* cuctx = ctx->CUDACtx();
// Only the first label column is used.
auto labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
CheckNDCGLabels(this->Param(), labels, CheckNDCGOp{cuctx});
auto d_group_ptr = this->DataGroupPtr(ctx);
std::size_t n_groups = d_group_ptr.size() - 1;
inv_idcg_ = linalg::Zeros<double>(ctx, n_groups);
auto d_inv_idcg = inv_idcg_.View(ctx->gpu_id);
cuda_impl::CalcQueriesInvIDCG(ctx, labels, d_group_ptr, d_inv_idcg, this->Param());
CHECK_GE(this->Param().NumPair(), 1ul);
// One discount per position, up to the size of the largest group.
discounts_.SetDevice(ctx->gpu_id);
discounts_.Resize(MaxGroupSize());
auto d_discount = discounts_.DeviceSpan();
dh::LaunchN(MaxGroupSize(), cuctx->Stream(),
[=] XGBOOST_DEVICE(std::size_t i) { d_discount[i] = CalcDCGDiscount(i); });
}
// Device initialisation of the MAP cache: validates the first label column with
// CheckMapLabels, using the thrust based CheckMAPOp as the predicate runner.
void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
CheckMapLabels(d_label, CheckMAPOp{ctx->CUDACtx()});
}
} // namespace xgboost::ltr

View File

@@ -0,0 +1,40 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_RANKING_UTILS_CUH_
#define XGBOOST_COMMON_RANKING_UTILS_CUH_
#include <cstddef> // for size_t
#include "ranking_utils.h" // for LambdaRankParam
#include "xgboost/base.h" // for bst_group_t, XGBOOST_DEVICE
#include "xgboost/context.h" // for Context
#include "xgboost/linalg.h" // for VectorView
#include "xgboost/span.h" // for Span
namespace xgboost {
namespace ltr {
namespace cuda_impl {
void CalcQueriesDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
common::Span<bst_group_t const> d_group_ptr, std::size_t k,
linalg::VectorView<double> out_dcg);
void CalcQueriesInvIDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
common::Span<bst_group_t const> d_group_ptr,
linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const &p);
// Functions for creating number of threads for CUDA, and getting back the number of pairs
// from the number of threads.

// Number of threads for a group: one per (sample, pair) combination.
XGBOOST_DEVICE __forceinline__ std::size_t ThreadsForMean(std::size_t group_size,
                                                          std::size_t n_pairs) {
  std::size_t const n_threads = group_size * n_pairs;
  return n_threads;
}
// Number of pairs per sample given the thread count of a group (integer division).
// `group_size` must not be 0.
XGBOOST_DEVICE __forceinline__ std::size_t PairsForGroup(std::size_t n_threads,
                                                         std::size_t group_size) {
  std::size_t const n_pairs = n_threads / group_size;
  return n_pairs;
}
} // namespace cuda_impl
} // namespace ltr
} // namespace xgboost
#endif // XGBOOST_COMMON_RANKING_UTILS_CUH_

View File

@@ -11,7 +11,6 @@
#include <string> // for char_traits, string
#include <vector> // for vector
#include "./math.h" // for CloseTo
#include "dmlc/parameter.h" // for FieldEntry, DMLC_DECLARE_FIELD
#include "error_msg.h" // for GroupWeight, GroupSize
#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t
@@ -19,7 +18,7 @@
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for Vector, VectorView, Tensor
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK
#include "xgboost/logging.h" // for CHECK_EQ, CHECK
#include "xgboost/parameter.h" // for XGBoostParameter
#include "xgboost/span.h" // for Span
#include "xgboost/string_view.h" // for StringView
@@ -34,6 +33,25 @@ using rel_degree_t = std::uint32_t; // NOLINT
*/
using position_t = std::uint32_t; // NOLINT
/**
 * \brief Maximum relevance degree for NDCG: the number of bits of rel_degree_t minus one.
 */
constexpr std::size_t MaxRel() {
  constexpr std::size_t kBitsPerByte = 8;
  return sizeof(rel_degree_t) * kBitsPerByte - 1;
}
static_assert(MaxRel() == 31);
// Exponential DCG gain: 2^label - 1. `label` must be smaller than the bit width of
// unsigned int for the shift to be defined.
XGBOOST_DEVICE inline double CalcDCGGain(rel_degree_t label) {
  auto const pow2 = 1u << label;
  return static_cast<double>(pow2 - 1);
}
// DCG discount for the 0-based position `idx`: 1 / log2(idx + 2).
XGBOOST_DEVICE inline double CalcDCGDiscount(std::size_t idx) {
  double const shifted = static_cast<double>(idx) + 2.0;
  return 1.0 / std::log2(shifted);
}
// Reciprocal of the ideal DCG; an IDCG of exactly 0 maps to 0 instead of infinity.
XGBOOST_DEVICE inline double CalcInvIDCG(double idcg) {
  if (idcg == 0.0) {  // handle irrelevant document
    return 0.0;
  }
  return 1.0 / idcg;
}
enum class PairMethod : std::int32_t {
kTopK = 0,
kMean = 1,
@@ -115,7 +133,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
.describe("Number of pairs for each sample in the list.");
DMLC_DECLARE_FIELD(lambdarank_unbiased)
.set_default(false)
.describe("Unbiased lambda mart. Use IPW to debias click position");
.describe("Unbiased lambda mart. Use extended IPW to debias click position");
DMLC_DECLARE_FIELD(lambdarank_bias_norm)
.set_default(2.0)
.set_lower_bound(0.0)
@@ -126,6 +144,285 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
}
};
/**
 * \brief Common cached items for ranking tasks.
 *
 * Holds the group pointer, the sorted index of the predictions and a set of buffers
 * that are only used on CUDA. Most buffers are allocated lazily by their accessor.
 */
class RankingCache {
private:
// Device specific initialisation, selected by the constructor.
void InitOnCPU(Context const* ctx, MetaInfo const& info);
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
// Cached parameter
LambdaRankParam param_;
// offset to data groups.
HostDeviceVector<bst_group_t> group_ptr_;
// store the sorted index of prediction.
HostDeviceVector<std::size_t> sorted_idx_cache_;
// Maximum size of group
std::size_t max_group_size_{0};
// Normalization for weight
double weight_norm_{1.0};
/**
* CUDA cache
*/
// offset to threads assigned to each group for gradient calculation
HostDeviceVector<std::size_t> threads_group_ptr_;
// Sorted index of label for finding buckets.
HostDeviceVector<std::size_t> y_sorted_idx_cache_;
// Cached labels sorted by the model
HostDeviceVector<float> y_ranked_by_model_;
// store rounding factor for objective for each group
linalg::Vector<GradientPair> roundings_;
// rounding factor for cost
HostDeviceVector<double> cost_rounding_;
// temporary storage for creating rounding factors. Stored as byte to avoid having cuda
// data structure in here.
HostDeviceVector<std::uint8_t> max_lambdas_;
// total number of cuda threads used for gradient calculation
std::size_t n_cuda_threads_{0};
// Create model rank list on GPU
common::Span<std::size_t const> MakeRankOnCUDA(Context const* ctx,
common::Span<float const> predt);
// Create model rank list on CPU
common::Span<std::size_t const> MakeRankOnCPU(Context const* ctx,
common::Span<float const> predt);
protected:
[[nodiscard]] std::size_t MaxGroupSize() const { return max_group_size_; }
public:
/**
 * \param ctx  Context, selects between the CPU and the CUDA initialisation.
 * \param info Meta info of the dataset (group pointer, labels, weights).
 * \param p    Ranking parameter, must be initialised.
 */
RankingCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : param_{p} {
CHECK(param_.GetInitialised());
if (!info.group_ptr_.empty()) {
CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
<< error::GroupSize() << "the size of label.";
}
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
this->InitOnCUDA(ctx, info);
}
// NOTE(review): this size check runs after Init*, which already reads one weight per
// group -- confirm that callers validate the weights beforehand.
if (!info.weights_.Empty()) {
CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
}
}
// Upper bound on the number of positions to track.
[[nodiscard]] std::size_t MaxPositionSize() const {
// Use truncation level as bound.
if (param_.HasTruncation()) {
return param_.NumPair();
}
// Hardcoded maximum size of positions to track. We don't need too many of them as the
// bias decreases exponentially.
return std::min(max_group_size_, static_cast<std::size_t>(32));
}
// Constructed as [0, n_samples] if group ptr is not supplied by the user
common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
group_ptr_.SetDevice(ctx->gpu_id);
return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
}
[[nodiscard]] auto const& Param() const { return param_; }
// Number of groups, i.e. the size of the group pointer minus one.
[[nodiscard]] std::size_t Groups() const { return group_ptr_.Size() - 1; }
[[nodiscard]] double WeightNorm() const { return weight_norm_; }
// Create a rank list by model prediction
common::Span<std::size_t const> SortedIdx(Context const* ctx, common::Span<float const> predt) {
if (sorted_idx_cache_.Empty()) {
sorted_idx_cache_.SetDevice(ctx->gpu_id);
sorted_idx_cache_.Resize(predt.size());
}
if (ctx->IsCPU()) {
return this->MakeRankOnCPU(ctx, predt);
} else {
return this->MakeRankOnCUDA(ctx, predt);
}
}
// The function simply returns an uninitialized buffer as this is only used by the
// objective for creating pairs.
common::Span<std::size_t> SortedIdxY(Context const* ctx, std::size_t n_samples) {
CHECK(ctx->IsCUDA());
if (y_sorted_idx_cache_.Empty()) {
y_sorted_idx_cache_.SetDevice(ctx->gpu_id);
y_sorted_idx_cache_.Resize(n_samples);
}
return y_sorted_idx_cache_.DeviceSpan();
}
// Device buffer of `n_samples` floats, allocated on first use. CUDA only.
common::Span<float> RankedY(Context const* ctx, std::size_t n_samples) {
CHECK(ctx->IsCUDA());
if (y_ranked_by_model_.Empty()) {
y_ranked_by_model_.SetDevice(ctx->gpu_id);
y_ranked_by_model_.Resize(n_samples);
}
return y_ranked_by_model_.DeviceSpan();
}
// CUDA cache getters, the cache is shared between metric and objective, some of these
// fields are lazy initialized to avoid unnecessary allocation.
[[nodiscard]] common::Span<std::size_t const> CUDAThreadsGroupPtr() const {
CHECK(!threads_group_ptr_.Empty());
return threads_group_ptr_.ConstDeviceSpan();
}
[[nodiscard]] std::size_t CUDAThreads() const { return n_cuda_threads_; }
// One rounding factor per group, allocated on first use.
linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
if (roundings_.Size() == 0) {
roundings_.SetDevice(ctx->gpu_id);
roundings_.Reshape(Groups());
}
return roundings_.View(ctx->gpu_id);
}
// Single element device buffer for the cost rounding factor, allocated on first use.
common::Span<double> CUDACostRounding(Context const* ctx) {
if (cost_rounding_.Size() == 0) {
cost_rounding_.SetDevice(ctx->gpu_id);
cost_rounding_.Resize(1);
}
return cost_rounding_.DeviceSpan();
}
// Device buffer for `n` values of `Type`, backed by the byte storage `max_lambdas_`.
// The storage is resized whenever the requested size in bytes differs from its size.
template <typename Type>
common::Span<Type> MaxLambdas(Context const* ctx, std::size_t n) {
max_lambdas_.SetDevice(ctx->gpu_id);
std::size_t bytes = n * sizeof(Type);
if (bytes != max_lambdas_.Size()) {
max_lambdas_.Resize(bytes);
}
return common::Span<Type>{reinterpret_cast<Type*>(max_lambdas_.DevicePointer()), n};
}
};
/**
 * \brief Ranking cache extended with the items needed for NDCG: the position discounts,
 *        the inverse IDCG of every group and a lazily allocated DCG buffer.
 */
class NDCGCache : public RankingCache {
  // NDCG discount
  HostDeviceVector<double> discounts_;
  // 1.0 / IDCG
  linalg::Vector<double> inv_idcg_;
  // CUDA cache: store the intermediate DCG calculation result for metric
  linalg::Vector<double> dcg_;

 public:
  // NOTE(review): kept public; presumably required because the CUDA implementation
  // contains an extended device lambda -- confirm before changing the access level.
  void InitOnCPU(Context const* ctx, MetaInfo const& info);
  void InitOnCUDA(Context const* ctx, MetaInfo const& info);

  NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
      : RankingCache{ctx, info, p} {
    if (!ctx->IsCPU()) {
      this->InitOnCUDA(ctx, info);
    } else {
      this->InitOnCPU(ctx, info);
    }
  }

  // View of 1.0 / IDCG, one value per group.
  linalg::VectorView<double const> InvIDCG(Context const* ctx) const {
    return inv_idcg_.View(ctx->gpu_id);
  }

  // Discount table on the host for a CPU context, on the device otherwise.
  common::Span<double const> Discount(Context const* ctx) const {
    if (ctx->IsCPU()) {
      return discounts_.ConstHostSpan();
    }
    return discounts_.ConstDeviceSpan();
  }

  // DCG buffer with one value per group, allocated on first use.
  linalg::VectorView<double> Dcg(Context const* ctx) {
    bool const needs_alloc = dcg_.Size() == 0;
    if (needs_alloc) {
      dcg_.SetDevice(ctx->gpu_id);
      dcg_.Reshape(this->Groups());
    }
    return dcg_.View(ctx->gpu_id);
  }
};
/**
 * \brief Validate label for NDCG
 *
 * Nothing is checked unless the exponential gain is used. With the exponential gain,
 * every label must be a non-negative integer (within kRtEps) that does not exceed
 * MaxRel().
 *
 * \tparam NoneOf Implementation of std::none_of. Specified as a parameter to reuse the
 *                check for both CPU and GPU.
 */
template <typename NoneOf>
void CheckNDCGLabels(ltr::LambdaRankParam const& p, linalg::VectorView<float const> labels,
                     NoneOf none_of) {
  auto values = labels.Values();
  if (!p.ndcg_exp_gain) {
    return;
  }

  // No label may be negative or have a fractional part.
  auto label_is_integer =
      none_of(values.data(), values.data() + values.size(), [] XGBOOST_DEVICE(float v) {
        auto l = std::floor(v);
        return std::fabs(l - v) > kRtEps || v < 0.0f;
      });
  CHECK(label_is_integer)
      << "When using relevance degree as target, label must be either 0 or positive integer.";

  // No label may exceed the largest supported relevance degree.
  auto label_is_valid = none_of(values.data(), values.data() + values.size(),
                                [] XGBOOST_DEVICE(ltr::rel_degree_t v) { return v > MaxRel(); });
  CHECK(label_is_valid) << "Relevance degress must be lesser than or equal to " << MaxRel()
                        << " when the exponential NDCG gain function is used. "
                        << "Set `ndcg_exp_gain` to false to use custom DCG gain.";
}
/**
 * \brief Whether every label is, within kRtEps, either 0 or 1.
 *
 * \tparam AllOf Implementation of std::all_of, taking (begin, end, predicate).
 */
template <typename AllOf>
bool IsBinaryRel(linalg::VectorView<float const> label, AllOf all_of) {
  auto values = label.Values();
  return all_of(values.data(), values.data() + values.size(), [] XGBOOST_DEVICE(float y) {
    bool const is_one = std::abs(y - 1.0f) < kRtEps;
    bool const is_zero = std::abs(y - 0.0f) < kRtEps;
    return is_one || is_zero;
  });
}
/**
 * \brief Validate label for MAP
 *
 * Fails with a fatal check unless IsBinaryRel() holds for the labels.
 *
 * \tparam AllOf Implementation of std::all_of. Specified as a parameter to reuse the
 *               check for both CPU and GPU.
 *
 * \param label  Labels to validate.
 * \param all_of Callable taking (begin, end, predicate).
 */
template <typename AllOf>
void CheckMapLabels(linalg::VectorView<float const> label, AllOf all_of) {
  auto is_binary = IsBinaryRel(label, all_of);
  CHECK(is_binary) << "MAP can only be used with binary labels.";
}
/**
 * \brief Ranking cache extended with the buffers needed for MAP. All buffers are
 *        allocated on first access.
 */
class MAPCache : public RankingCache {
  // Total number of relevant documents for each group
  HostDeviceVector<double> n_rel_;
  // \sum l_k/k
  HostDeviceVector<double> acc_;
  HostDeviceVector<double> map_;
  // Number of samples in this dataset.
  std::size_t n_samples_{0};

  void InitOnCPU(Context const* ctx, MetaInfo const& info);
  void InitOnCUDA(Context const* ctx, MetaInfo const& info);

  // Resize an empty buffer to `n` elements on the context's device, then return the host
  // span for a CPU context and the device span otherwise.
  static common::Span<double> LazySpan(Context const* ctx, HostDeviceVector<double>* p_buffer,
                                       std::size_t n) {
    if (p_buffer->Empty()) {
      p_buffer->SetDevice(ctx->gpu_id);
      p_buffer->Resize(n);
    }
    return ctx->IsCPU() ? p_buffer->HostSpan() : p_buffer->DeviceSpan();
  }

 public:
  MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
      : RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
    if (!ctx->IsCPU()) {
      this->InitOnCUDA(ctx, info);
    } else {
      this->InitOnCPU(ctx, info);
    }
  }

  // Buffer with one element per sample.
  common::Span<double> NumRelevant(Context const* ctx) {
    return LazySpan(ctx, &n_rel_, n_samples_);
  }
  // Buffer with one element per sample.
  common::Span<double> Acc(Context const* ctx) { return LazySpan(ctx, &acc_, n_samples_); }
  // Buffer with one element per group.
  common::Span<double> Map(Context const* ctx) { return LazySpan(ctx, &map_, this->Groups()); }
};
/**
* \brief Parse name for ranking metric given parameters.
*

View File

@@ -8,9 +8,11 @@
#include <dmlc/omp.h>
#include <algorithm>
#include <cstdint> // std::int32_t
#include <cstdint> // for int32_t
#include <cstdlib> // for malloc, free
#include <limits>
#include <type_traits> // std::is_signed
#include <new> // for bad_alloc
#include <type_traits> // for is_signed
#include <vector>
#include "xgboost/logging.h"
@@ -266,7 +268,7 @@ class MemStackAllocator {
if (MaxStackSize >= required_size_) {
ptr_ = stack_mem_;
} else {
ptr_ = reinterpret_cast<T*>(malloc(required_size_ * sizeof(T)));
ptr_ = reinterpret_cast<T*>(std::malloc(required_size_ * sizeof(T)));
}
if (!ptr_) {
throw std::bad_alloc{};
@@ -278,7 +280,7 @@ class MemStackAllocator {
~MemStackAllocator() {
if (required_size_ > MaxStackSize) {
free(ptr_);
std::free(ptr_);
}
}
T& operator[](size_t i) { return ptr_[i]; }