initial merge
This commit is contained in:
@@ -14,7 +14,7 @@
|
||||
|
||||
// clang with libstdc++ works as well
|
||||
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
|
||||
!defined(__APPLE__) && __has_include(<omp.h>)
|
||||
!defined(__APPLE__) && __has_include(<omp.h>) && __has_include(<parallel/algorithm>)
|
||||
#define GCC_HAS_PARALLEL 1
|
||||
#endif // GLIC_VERSION
|
||||
|
||||
|
||||
@@ -121,17 +121,20 @@ namespace dh {
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
|
||||
|
||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
||||
int line) {
|
||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
|
||||
if (code != ncclSuccess) {
|
||||
std::stringstream ss;
|
||||
ss << "NCCL failure :" << ncclGetErrorString(code);
|
||||
ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
|
||||
ss << " " << file << "(" << line << ")\n";
|
||||
if (code == ncclUnhandledCudaError) {
|
||||
// nccl usually preserves the last error so we can get more details.
|
||||
auto err = cudaPeekAtLastError();
|
||||
ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
|
||||
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
|
||||
} else if (code == ncclSystemError) {
|
||||
ss << " This might be caused by a network configuration issue. Please consider specifying "
|
||||
"the network interface for NCCL via environment variables listed in its reference: "
|
||||
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
|
||||
}
|
||||
ss << " " << file << "(" << line << ")";
|
||||
LOG(FATAL) << ss.str();
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
* Copyright 2017-2023 XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
|
||||
#include <thrust/binary_search.h> // thrust::upper_bound
|
||||
#include <thrust/device_malloc_allocator.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
@@ -95,20 +98,23 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) { // NOLINT
|
||||
}
|
||||
namespace dh {
|
||||
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
#ifdef XGBOOST_USE_RCCL
|
||||
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
|
||||
|
||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
||||
int line) {
|
||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
|
||||
if (code != ncclSuccess) {
|
||||
std::stringstream ss;
|
||||
ss << "NCCL failure :" << ncclGetErrorString(code);
|
||||
ss << "RCCL failure: " << ncclGetErrorString(code) << ".";
|
||||
ss << " " << file << "(" << line << ")\n";
|
||||
if (code == ncclUnhandledCudaError) {
|
||||
// nccl usually preserves the last error so we can get more details.
|
||||
auto err = hipPeekAtLastError();
|
||||
ss << " " << thrust::system_error(err, thrust::hip_category()).what();
|
||||
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
|
||||
} else if (code == ncclSystemError) {
|
||||
ss << " This might be caused by a network configuration issue. Please consider specifying "
|
||||
"the network interface for NCCL via environment variables listed in its reference: "
|
||||
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
|
||||
}
|
||||
ss << " " << file << "(" << line << ")";
|
||||
LOG(FATAL) << ss.str();
|
||||
}
|
||||
|
||||
|
||||
@@ -20,5 +20,9 @@ constexpr StringView GroupSize() {
|
||||
constexpr StringView LabelScoreSize() {
|
||||
return "The size of label doesn't match the size of prediction.";
|
||||
}
|
||||
|
||||
constexpr StringView InfInData() {
|
||||
return "Input data contains `inf` or a value too large, while `missing` is not set to `inf`";
|
||||
}
|
||||
} // namespace xgboost::error
|
||||
#endif // XGBOOST_COMMON_ERROR_MSG_H_
|
||||
|
||||
@@ -7,23 +7,22 @@
|
||||
#ifndef XGBOOST_COMMON_HIST_UTIL_H_
|
||||
#define XGBOOST_COMMON_HIST_UTIL_H_
|
||||
|
||||
#include <xgboost/data.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint> // for uint32_t
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "algorithm.h" // SegmentId
|
||||
#include "categorical.h"
|
||||
#include "common.h"
|
||||
#include "quantile.h"
|
||||
#include "row_set.h"
|
||||
#include "threading_utils.h"
|
||||
#include "timer.h"
|
||||
#include "xgboost/base.h" // bst_feature_t, bst_bin_t
|
||||
#include "xgboost/base.h" // for bst_feature_t, bst_bin_t
|
||||
#include "xgboost/data.h"
|
||||
|
||||
namespace xgboost {
|
||||
class GHistIndexMatrix;
|
||||
@@ -392,15 +391,18 @@ class HistCollection {
|
||||
}
|
||||
|
||||
// have we computed a histogram for i-th node?
|
||||
bool RowExists(bst_uint nid) const {
|
||||
[[nodiscard]] bool RowExists(bst_uint nid) const {
|
||||
const uint32_t k_max = std::numeric_limits<uint32_t>::max();
|
||||
return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
|
||||
}
|
||||
|
||||
// initialize histogram collection
|
||||
void Init(uint32_t nbins) {
|
||||
if (nbins_ != nbins) {
|
||||
nbins_ = nbins;
|
||||
/**
|
||||
* \brief Initialize histogram collection.
|
||||
*
|
||||
* \param n_total_bins Number of bins across all features.
|
||||
*/
|
||||
void Init(std::uint32_t n_total_bins) {
|
||||
if (nbins_ != n_total_bins) {
|
||||
nbins_ = n_total_bins;
|
||||
// quite expensive operation, so let's do this only once
|
||||
data_.clear();
|
||||
}
|
||||
|
||||
@@ -333,7 +333,7 @@ size_t constexpr JsonReader::kMaxNumLength;
|
||||
Json JsonReader::Parse() {
|
||||
while (true) {
|
||||
SkipSpaces();
|
||||
char c = PeekNextChar();
|
||||
auto c = PeekNextChar();
|
||||
if (c == -1) { break; }
|
||||
|
||||
if (c == '{') {
|
||||
@@ -408,13 +408,13 @@ void JsonReader::Error(std::string msg) const {
|
||||
}
|
||||
|
||||
namespace {
|
||||
bool IsSpace(char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
|
||||
bool IsSpace(JsonReader::Char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
|
||||
} // anonymous namespace
|
||||
|
||||
// Json class
|
||||
void JsonReader::SkipSpaces() {
|
||||
while (cursor_.Pos() < raw_str_.size()) {
|
||||
char c = raw_str_[cursor_.Pos()];
|
||||
Char c = raw_str_[cursor_.Pos()];
|
||||
if (IsSpace(c)) {
|
||||
cursor_.Forward();
|
||||
} else {
|
||||
@@ -436,12 +436,12 @@ void ParseStr(std::string const& str) {
|
||||
}
|
||||
|
||||
Json JsonReader::ParseString() {
|
||||
char ch { GetConsecutiveChar('\"') }; // NOLINT
|
||||
Char ch { GetConsecutiveChar('\"') }; // NOLINT
|
||||
std::string str;
|
||||
while (true) {
|
||||
ch = GetNextChar();
|
||||
if (ch == '\\') {
|
||||
char next = static_cast<char>(GetNextChar());
|
||||
Char next{GetNextChar()};
|
||||
switch (next) {
|
||||
case 'r': str += u8"\r"; break;
|
||||
case 'n': str += u8"\n"; break;
|
||||
@@ -466,8 +466,8 @@ Json JsonReader::ParseString() {
|
||||
}
|
||||
|
||||
Json JsonReader::ParseNull() {
|
||||
char ch = GetNextNonSpaceChar();
|
||||
std::string buffer{ch};
|
||||
Char ch = GetNextNonSpaceChar();
|
||||
std::string buffer{static_cast<char>(ch)};
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
buffer.push_back(GetNextChar());
|
||||
}
|
||||
@@ -480,7 +480,7 @@ Json JsonReader::ParseNull() {
|
||||
Json JsonReader::ParseArray() {
|
||||
std::vector<Json> data;
|
||||
|
||||
char ch { GetConsecutiveChar('[') }; // NOLINT
|
||||
Char ch { GetConsecutiveChar('[') }; // NOLINT
|
||||
while (true) {
|
||||
if (PeekNextChar() == ']') {
|
||||
GetConsecutiveChar(']');
|
||||
@@ -503,7 +503,7 @@ Json JsonReader::ParseObject() {
|
||||
|
||||
Object::Map data;
|
||||
SkipSpaces();
|
||||
char ch = PeekNextChar();
|
||||
auto ch = PeekNextChar();
|
||||
|
||||
if (ch == '}') {
|
||||
GetConsecutiveChar('}');
|
||||
@@ -652,7 +652,7 @@ Json JsonReader::ParseNumber() {
|
||||
|
||||
Json JsonReader::ParseBoolean() {
|
||||
bool result = false;
|
||||
char ch = GetNextNonSpaceChar();
|
||||
Char ch = GetNextNonSpaceChar();
|
||||
std::string const t_value = u8"true";
|
||||
std::string const f_value = u8"false";
|
||||
|
||||
@@ -737,7 +737,8 @@ Json UBJReader::ParseArray() {
|
||||
case 'L':
|
||||
return ParseTypedArray<I64Array>(n);
|
||||
default:
|
||||
LOG(FATAL) << "`" + std::string{type} + "` is not supported for typed array."; // NOLINT
|
||||
LOG(FATAL) << "`" + std::string{static_cast<char>(type)} + // NOLINT
|
||||
"` is not supported for typed array.";
|
||||
}
|
||||
}
|
||||
std::vector<Json> results;
|
||||
@@ -794,7 +795,7 @@ Json UBJReader::Load() {
|
||||
|
||||
Json UBJReader::Parse() {
|
||||
while (true) {
|
||||
char c = PeekNextChar();
|
||||
auto c = PeekNextChar();
|
||||
if (c == -1) {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
/*!
|
||||
* Copyright 2022, XGBoost contributors.
|
||||
/**
|
||||
* Copyright 2022-2023 by XGBoost contributors.
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_NUMERIC_H_
|
||||
#define XGBOOST_COMMON_NUMERIC_H_
|
||||
|
||||
#include <dmlc/common.h> // OMPException
|
||||
|
||||
#include <algorithm> // std::max
|
||||
#include <iterator> // std::iterator_traits
|
||||
#include <algorithm> // for std::max
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int32_t
|
||||
#include <iterator> // for iterator_traits
|
||||
#include <vector>
|
||||
|
||||
#include "common.h" // AssertGPUSupport
|
||||
@@ -15,8 +17,7 @@
|
||||
#include "xgboost/context.h" // Context
|
||||
#include "xgboost/host_device_vector.h" // HostDeviceVector
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
namespace xgboost::common {
|
||||
|
||||
/**
|
||||
* \brief Run length encode on CPU, input must be sorted.
|
||||
@@ -111,11 +112,11 @@ inline double Reduce(Context const*, HostDeviceVector<float> const&) {
|
||||
namespace cpu_impl {
|
||||
template <typename It, typename V = typename It::value_type>
|
||||
V Reduce(Context const* ctx, It first, It second, V const& init) {
|
||||
size_t n = std::distance(first, second);
|
||||
common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(ctx->Threads(), init);
|
||||
common::ParallelFor(n, ctx->Threads(),
|
||||
[&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
|
||||
auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + ctx->Threads(), init);
|
||||
std::size_t n = std::distance(first, second);
|
||||
auto n_threads = static_cast<std::size_t>(std::min(n, static_cast<std::size_t>(ctx->Threads())));
|
||||
common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(n_threads, init);
|
||||
common::ParallelFor(n, n_threads, [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
|
||||
auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + n_threads, init);
|
||||
return result;
|
||||
}
|
||||
} // namespace cpu_impl
|
||||
@@ -144,7 +145,6 @@ void Iota(Context const* ctx, It first, It last,
|
||||
});
|
||||
}
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::common
|
||||
|
||||
#endif // XGBOOST_COMMON_NUMERIC_H_
|
||||
|
||||
@@ -1,391 +1,386 @@
|
||||
/*!
|
||||
* Copyright 2021-2022 by Contributors
|
||||
* \file row_set.h
|
||||
* \brief Quick Utility to compute subset of rows
|
||||
* \author Philip Cho, Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_
|
||||
#define XGBOOST_COMMON_PARTITION_BUILDER_H_
|
||||
|
||||
#include <xgboost/data.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "../tree/hist/expand_entry.h"
|
||||
#include "categorical.h"
|
||||
#include "column_matrix.h"
|
||||
#include "xgboost/context.h"
|
||||
#include "xgboost/tree_model.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
// The builder is required for samples partition to left and rights children for set of nodes
|
||||
// Responsible for:
|
||||
// 1) Effective memory allocation for intermediate results for multi-thread work
|
||||
// 2) Merging partial results produced by threads into original row set (row_set_collection_)
|
||||
// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
|
||||
template<size_t BlockSize>
|
||||
class PartitionBuilder {
|
||||
using BitVector = RBitField8;
|
||||
|
||||
public:
|
||||
template<typename Func>
|
||||
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
|
||||
left_right_nodes_sizes_.resize(n_nodes);
|
||||
blocks_offsets_.resize(n_nodes+1);
|
||||
|
||||
blocks_offsets_[0] = 0;
|
||||
for (size_t i = 1; i < n_nodes+1; ++i) {
|
||||
blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTask(i-1);
|
||||
}
|
||||
|
||||
if (n_tasks > max_n_tasks_) {
|
||||
mem_blocks_.resize(n_tasks);
|
||||
max_n_tasks_ = n_tasks;
|
||||
}
|
||||
}
|
||||
|
||||
// split row indexes (rid_span) to 2 parts (left_part, right_part) depending
|
||||
// on comparison of indexes values (idx_span) and split point (split_cond)
|
||||
// Handle dense columns
|
||||
// Analog of std::stable_partition, but in no-inplace manner
|
||||
template <bool default_left, bool any_missing, typename ColumnType, typename Predicate>
|
||||
inline std::pair<size_t, size_t> PartitionKernel(ColumnType* p_column,
|
||||
common::Span<const size_t> row_indices,
|
||||
common::Span<size_t> left_part,
|
||||
common::Span<size_t> right_part,
|
||||
size_t base_rowid, Predicate&& pred) {
|
||||
auto& column = *p_column;
|
||||
size_t* p_left_part = left_part.data();
|
||||
size_t* p_right_part = right_part.data();
|
||||
size_t nleft_elems = 0;
|
||||
size_t nright_elems = 0;
|
||||
|
||||
auto p_row_indices = row_indices.data();
|
||||
auto n_samples = row_indices.size();
|
||||
|
||||
for (size_t i = 0; i < n_samples; ++i) {
|
||||
auto rid = p_row_indices[i];
|
||||
const int32_t bin_id = column[rid - base_rowid];
|
||||
if (any_missing && bin_id == ColumnType::kMissingId) {
|
||||
if (default_left) {
|
||||
p_left_part[nleft_elems++] = rid;
|
||||
} else {
|
||||
p_right_part[nright_elems++] = rid;
|
||||
}
|
||||
} else {
|
||||
if (pred(rid, bin_id)) {
|
||||
p_left_part[nleft_elems++] = rid;
|
||||
} else {
|
||||
p_right_part[nright_elems++] = rid;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return {nleft_elems, nright_elems};
|
||||
}
|
||||
|
||||
template <typename Pred>
|
||||
inline std::pair<size_t, size_t> PartitionRangeKernel(common::Span<const size_t> ridx,
|
||||
common::Span<size_t> left_part,
|
||||
common::Span<size_t> right_part,
|
||||
Pred pred) {
|
||||
size_t* p_left_part = left_part.data();
|
||||
size_t* p_right_part = right_part.data();
|
||||
size_t nleft_elems = 0;
|
||||
size_t nright_elems = 0;
|
||||
for (auto row_id : ridx) {
|
||||
if (pred(row_id)) {
|
||||
p_left_part[nleft_elems++] = row_id;
|
||||
} else {
|
||||
p_right_part[nright_elems++] = row_id;
|
||||
}
|
||||
}
|
||||
return {nleft_elems, nright_elems};
|
||||
}
|
||||
|
||||
template <typename BinIdxType, bool any_missing, bool any_cat>
|
||||
void Partition(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
|
||||
const common::Range1d range,
|
||||
const bst_bin_t split_cond, GHistIndexMatrix const& gmat,
|
||||
const common::ColumnMatrix& column_matrix,
|
||||
const RegTree& tree, const size_t* rid) {
|
||||
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
|
||||
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
|
||||
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
|
||||
std::size_t nid = nodes[node_in_set].nid;
|
||||
bst_feature_t fid = tree[nid].SplitIndex();
|
||||
bool default_left = tree[nid].DefaultLeft();
|
||||
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
|
||||
auto node_cats = tree.NodeCats(nid);
|
||||
auto const& cut_values = gmat.cut.Values();
|
||||
|
||||
auto pred_hist = [&](auto ridx, auto bin_id) {
|
||||
if (any_cat && is_cat) {
|
||||
auto gidx = gmat.GetGindex(ridx, fid);
|
||||
bool go_left = default_left;
|
||||
if (gidx > -1) {
|
||||
go_left = Decision(node_cats, cut_values[gidx]);
|
||||
}
|
||||
return go_left;
|
||||
} else {
|
||||
return bin_id <= split_cond;
|
||||
}
|
||||
};
|
||||
|
||||
auto pred_approx = [&](auto ridx) {
|
||||
auto gidx = gmat.GetGindex(ridx, fid);
|
||||
bool go_left = default_left;
|
||||
if (gidx > -1) {
|
||||
if (is_cat) {
|
||||
go_left = Decision(node_cats, cut_values[gidx]);
|
||||
} else {
|
||||
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
|
||||
}
|
||||
}
|
||||
return go_left;
|
||||
};
|
||||
|
||||
std::pair<size_t, size_t> child_nodes_sizes;
|
||||
if (!column_matrix.IsInitialized()) {
|
||||
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
|
||||
} else {
|
||||
if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
|
||||
auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
|
||||
if (default_left) {
|
||||
child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
|
||||
gmat.base_rowid, pred_hist);
|
||||
} else {
|
||||
child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
|
||||
gmat.base_rowid, pred_hist);
|
||||
}
|
||||
} else {
|
||||
CHECK_EQ(any_missing, true);
|
||||
auto column =
|
||||
column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
|
||||
if (default_left) {
|
||||
child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
|
||||
gmat.base_rowid, pred_hist);
|
||||
} else {
|
||||
child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
|
||||
gmat.base_rowid, pred_hist);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const size_t n_left = child_nodes_sizes.first;
|
||||
const size_t n_right = child_nodes_sizes.second;
|
||||
|
||||
SetNLeftElems(node_in_set, range.begin(), n_left);
|
||||
SetNRightElems(node_in_set, range.begin(), n_right);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief When data is split by column, we don't have all the features locally on the current
|
||||
* worker, so we go through all the rows and mark the bit vectors on whether the decision is made
|
||||
* to go right, or if the feature value used for the split is missing.
|
||||
*/
|
||||
void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
|
||||
const common::Range1d range, GHistIndexMatrix const& gmat,
|
||||
const common::ColumnMatrix& column_matrix,
|
||||
const RegTree& tree, const size_t* rid,
|
||||
BitVector* decision_bits, BitVector* missing_bits) {
|
||||
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
|
||||
std::size_t nid = nodes[node_in_set].nid;
|
||||
bst_feature_t fid = tree[nid].SplitIndex();
|
||||
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
|
||||
auto node_cats = tree.NodeCats(nid);
|
||||
auto const& cut_values = gmat.cut.Values();
|
||||
|
||||
if (!column_matrix.IsInitialized()) {
|
||||
for (auto row_id : rid_span) {
|
||||
auto gidx = gmat.GetGindex(row_id, fid);
|
||||
if (gidx > -1) {
|
||||
bool go_left = false;
|
||||
if (is_cat) {
|
||||
go_left = Decision(node_cats, cut_values[gidx]);
|
||||
} else {
|
||||
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
|
||||
}
|
||||
if (go_left) {
|
||||
decision_bits->Set(row_id - gmat.base_rowid);
|
||||
}
|
||||
} else {
|
||||
missing_bits->Set(row_id - gmat.base_rowid);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Once we've aggregated the decision and missing bits from all the workers, we can then
|
||||
* use them to partition the rows accordingly.
|
||||
*/
|
||||
void PartitionByMask(const size_t node_in_set,
|
||||
std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
|
||||
const common::Range1d range, GHistIndexMatrix const& gmat,
|
||||
const common::ColumnMatrix& column_matrix, const RegTree& tree,
|
||||
const size_t* rid, BitVector const& decision_bits,
|
||||
BitVector const& missing_bits) {
|
||||
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
|
||||
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
|
||||
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
|
||||
std::size_t nid = nodes[node_in_set].nid;
|
||||
bool default_left = tree[nid].DefaultLeft();
|
||||
|
||||
auto pred_approx = [&](auto ridx) {
|
||||
bool go_left = default_left;
|
||||
bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
|
||||
if (!is_missing) {
|
||||
go_left = decision_bits.Check(ridx - gmat.base_rowid);
|
||||
}
|
||||
return go_left;
|
||||
};
|
||||
|
||||
std::pair<size_t, size_t> child_nodes_sizes;
|
||||
if (!column_matrix.IsInitialized()) {
|
||||
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
|
||||
} else {
|
||||
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
|
||||
}
|
||||
|
||||
const size_t n_left = child_nodes_sizes.first;
|
||||
const size_t n_right = child_nodes_sizes.second;
|
||||
|
||||
SetNLeftElems(node_in_set, range.begin(), n_left);
|
||||
SetNRightElems(node_in_set, range.begin(), n_right);
|
||||
}
|
||||
|
||||
// allocate thread local memory, should be called for each specific task
|
||||
void AllocateForTask(size_t id) {
|
||||
if (mem_blocks_[id].get() == nullptr) {
|
||||
BlockInfo* local_block_ptr = new BlockInfo;
|
||||
CHECK_NE(local_block_ptr, (BlockInfo*)nullptr);
|
||||
mem_blocks_[id].reset(local_block_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
|
||||
const size_t task_idx = GetTaskIdx(nid, begin);
|
||||
return { mem_blocks_.at(task_idx)->Left(), end - begin };
|
||||
}
|
||||
|
||||
common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
|
||||
const size_t task_idx = GetTaskIdx(nid, begin);
|
||||
return { mem_blocks_.at(task_idx)->Right(), end - begin };
|
||||
}
|
||||
|
||||
void SetNLeftElems(int nid, size_t begin, size_t n_left) {
|
||||
size_t task_idx = GetTaskIdx(nid, begin);
|
||||
mem_blocks_.at(task_idx)->n_left = n_left;
|
||||
}
|
||||
|
||||
void SetNRightElems(int nid, size_t begin, size_t n_right) {
|
||||
size_t task_idx = GetTaskIdx(nid, begin);
|
||||
mem_blocks_.at(task_idx)->n_right = n_right;
|
||||
}
|
||||
|
||||
|
||||
size_t GetNLeftElems(int nid) const {
|
||||
return left_right_nodes_sizes_[nid].first;
|
||||
}
|
||||
|
||||
size_t GetNRightElems(int nid) const {
|
||||
return left_right_nodes_sizes_[nid].second;
|
||||
}
|
||||
|
||||
// Each thread has partial results for some set of tree-nodes
|
||||
// The function decides order of merging partial results into final row set
|
||||
void CalculateRowOffsets() {
|
||||
for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
|
||||
size_t n_left = 0;
|
||||
for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
|
||||
mem_blocks_[j]->n_offset_left = n_left;
|
||||
n_left += mem_blocks_[j]->n_left;
|
||||
}
|
||||
size_t n_right = 0;
|
||||
for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
|
||||
mem_blocks_[j]->n_offset_right = n_left + n_right;
|
||||
n_right += mem_blocks_[j]->n_right;
|
||||
}
|
||||
left_right_nodes_sizes_[i] = {n_left, n_right};
|
||||
}
|
||||
}
|
||||
|
||||
void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
|
||||
size_t task_idx = GetTaskIdx(nid, begin);
|
||||
|
||||
size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left;
|
||||
size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right;
|
||||
|
||||
const size_t* left = mem_blocks_[task_idx]->Left();
|
||||
const size_t* right = mem_blocks_[task_idx]->Right();
|
||||
|
||||
std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result);
|
||||
std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result);
|
||||
}
|
||||
|
||||
size_t GetTaskIdx(int nid, size_t begin) {
|
||||
return blocks_offsets_[nid] + begin / BlockSize;
|
||||
}
|
||||
|
||||
// Copy row partitions into global cache for reuse in objective
|
||||
template <typename Sampledp>
|
||||
void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set,
|
||||
std::vector<bst_node_t>* p_position, Sampledp sampledp) const {
|
||||
auto& h_pos = *p_position;
|
||||
h_pos.resize(row_set.Data()->size(), std::numeric_limits<bst_node_t>::max());
|
||||
|
||||
auto p_begin = row_set.Data()->data();
|
||||
ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) {
|
||||
auto const& node = row_set[i];
|
||||
if (node.node_id < 0) {
|
||||
return;
|
||||
}
|
||||
CHECK(tree[node.node_id].IsLeaf());
|
||||
if (node.begin) { // guard for empty node.
|
||||
size_t ptr_offset = node.end - p_begin;
|
||||
CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
|
||||
for (auto idx = node.begin; idx != node.end; ++idx) {
|
||||
h_pos[*idx] = sampledp(*idx) ? ~node.node_id : node.node_id;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
protected:
|
||||
struct BlockInfo{
|
||||
size_t n_left;
|
||||
size_t n_right;
|
||||
|
||||
size_t n_offset_left;
|
||||
size_t n_offset_right;
|
||||
|
||||
size_t* Left() {
|
||||
return &left_data_[0];
|
||||
}
|
||||
|
||||
size_t* Right() {
|
||||
return &right_data_[0];
|
||||
}
|
||||
private:
|
||||
size_t left_data_[BlockSize];
|
||||
size_t right_data_[BlockSize];
|
||||
};
|
||||
std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
|
||||
std::vector<size_t> blocks_offsets_;
|
||||
std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
|
||||
size_t max_n_tasks_ = 0;
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_
|
||||
/**
|
||||
* Copyright 2021-2023 by Contributors
|
||||
* \file row_set.h
|
||||
* \brief Quick Utility to compute subset of rows
|
||||
* \author Philip Cho, Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_
|
||||
#define XGBOOST_COMMON_PARTITION_BUILDER_H_
|
||||
|
||||
#include <xgboost/data.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef> // for size_t
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "../tree/hist/expand_entry.h"
|
||||
#include "categorical.h"
|
||||
#include "column_matrix.h"
|
||||
#include "xgboost/context.h"
|
||||
#include "xgboost/tree_model.h"
|
||||
|
||||
namespace xgboost::common {
|
||||
// The builder is required for samples partition to left and rights children for set of nodes
|
||||
// Responsible for:
|
||||
// 1) Effective memory allocation for intermediate results for multi-thread work
|
||||
// 2) Merging partial results produced by threads into original row set (row_set_collection_)
|
||||
// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
|
||||
template<size_t BlockSize>
|
||||
class PartitionBuilder {
|
||||
using BitVector = RBitField8;
|
||||
|
||||
public:
|
||||
template<typename Func>
|
||||
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
|
||||
left_right_nodes_sizes_.resize(n_nodes);
|
||||
blocks_offsets_.resize(n_nodes+1);
|
||||
|
||||
blocks_offsets_[0] = 0;
|
||||
for (size_t i = 1; i < n_nodes+1; ++i) {
|
||||
blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTask(i-1);
|
||||
}
|
||||
|
||||
if (n_tasks > max_n_tasks_) {
|
||||
mem_blocks_.resize(n_tasks);
|
||||
max_n_tasks_ = n_tasks;
|
||||
}
|
||||
}
|
||||
|
||||
// split row indexes (rid_span) to 2 parts (left_part, right_part) depending
|
||||
// on comparison of indexes values (idx_span) and split point (split_cond)
|
||||
// Handle dense columns
|
||||
// Analog of std::stable_partition, but in no-inplace manner
|
||||
template <bool default_left, bool any_missing, typename ColumnType, typename Predicate>
|
||||
inline std::pair<size_t, size_t> PartitionKernel(ColumnType* p_column,
|
||||
common::Span<const size_t> row_indices,
|
||||
common::Span<size_t> left_part,
|
||||
common::Span<size_t> right_part,
|
||||
size_t base_rowid, Predicate&& pred) {
|
||||
auto& column = *p_column;
|
||||
size_t* p_left_part = left_part.data();
|
||||
size_t* p_right_part = right_part.data();
|
||||
size_t nleft_elems = 0;
|
||||
size_t nright_elems = 0;
|
||||
|
||||
auto p_row_indices = row_indices.data();
|
||||
auto n_samples = row_indices.size();
|
||||
|
||||
for (size_t i = 0; i < n_samples; ++i) {
|
||||
auto rid = p_row_indices[i];
|
||||
const int32_t bin_id = column[rid - base_rowid];
|
||||
if (any_missing && bin_id == ColumnType::kMissingId) {
|
||||
if (default_left) {
|
||||
p_left_part[nleft_elems++] = rid;
|
||||
} else {
|
||||
p_right_part[nright_elems++] = rid;
|
||||
}
|
||||
} else {
|
||||
if (pred(rid, bin_id)) {
|
||||
p_left_part[nleft_elems++] = rid;
|
||||
} else {
|
||||
p_right_part[nright_elems++] = rid;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return {nleft_elems, nright_elems};
|
||||
}
|
||||
|
||||
template <typename Pred>
|
||||
inline std::pair<size_t, size_t> PartitionRangeKernel(common::Span<const size_t> ridx,
|
||||
common::Span<size_t> left_part,
|
||||
common::Span<size_t> right_part,
|
||||
Pred pred) {
|
||||
size_t* p_left_part = left_part.data();
|
||||
size_t* p_right_part = right_part.data();
|
||||
size_t nleft_elems = 0;
|
||||
size_t nright_elems = 0;
|
||||
for (auto row_id : ridx) {
|
||||
if (pred(row_id)) {
|
||||
p_left_part[nleft_elems++] = row_id;
|
||||
} else {
|
||||
p_right_part[nright_elems++] = row_id;
|
||||
}
|
||||
}
|
||||
return {nleft_elems, nright_elems};
|
||||
}
|
||||
|
||||
/**
 * @brief Partition the rows of one node into its left and right children.
 *
 * Rows in `range` are split into the task-local left/right scratch buffers using one of
 * two predicates: `pred_hist` when a column matrix is available (bin-index based), or
 * `pred_approx` when only the histogram index is available. The resulting partition
 * sizes are recorded; offsets are computed later by CalculateRowOffsets().
 *
 * @param node_in_set  Index of the node inside `nodes`.
 * @param nodes        Candidate nodes being expanded.
 * @param range        Row range handled by this task.
 * @param split_cond   Split bin for numerical splits (left iff bin <= split_cond).
 * @param gmat         Histogram index matrix.
 * @param column_matrix Column-major view of the data; may be uninitialized.
 * @param tree         Tree being grown; supplies split feature/default direction.
 * @param rid          Row indices of the node.
 */
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void Partition(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
               const common::Range1d range, const bst_bin_t split_cond,
               GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix,
               const RegTree& tree, const size_t* rid) {
  // Rows handled by this task and the scratch buffers the partitions go into.
  common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
  common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
  common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
  std::size_t nid = nodes[node_in_set].nid;
  bst_feature_t fid = tree.SplitIndex(nid);
  bool default_left = tree.DefaultLeft(nid);
  bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
  auto node_cats = tree.NodeCats(nid);
  auto const& cut_values = gmat.cut.Values();

  // Predicate for the column-matrix path: decide from the bin index; for
  // categorical splits a missing value (gidx == -1) follows the default direction.
  auto pred_hist = [&](auto ridx, auto bin_id) {
    if (any_cat && is_cat) {
      auto gidx = gmat.GetGindex(ridx, fid);
      bool go_left = default_left;
      if (gidx > -1) {
        go_left = Decision(node_cats, cut_values[gidx]);
      }
      return go_left;
    } else {
      // Numerical split: left iff the bin is on or below the split bin.
      return bin_id <= split_cond;
    }
  };

  // Predicate for the approx path (no materialized column matrix): look the bin
  // up in the histogram index and compare cut values directly.
  auto pred_approx = [&](auto ridx) {
    auto gidx = gmat.GetGindex(ridx, fid);
    bool go_left = default_left;  // missing value follows the default direction
    if (gidx > -1) {
      if (is_cat) {
        go_left = Decision(node_cats, cut_values[gidx]);
      } else {
        go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
      }
    }
    return go_left;
  };

  std::pair<size_t, size_t> child_nodes_sizes;
  if (!column_matrix.IsInitialized()) {
    child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
  } else {
    if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
      auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
      // `default_left` is lifted to a template parameter of the kernel.
      if (default_left) {
        child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
                                                               gmat.base_rowid, pred_hist);
      } else {
        child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
                                                                gmat.base_rowid, pred_hist);
      }
    } else {
      // Sparse columns imply the possibility of missing values.
      CHECK_EQ(any_missing, true);
      auto column =
          column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
      if (default_left) {
        child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
                                                               gmat.base_rowid, pred_hist);
      } else {
        child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
                                                                gmat.base_rowid, pred_hist);
      }
    }
  }

  const size_t n_left = child_nodes_sizes.first;
  const size_t n_right = child_nodes_sizes.second;

  // Record per-task partition sizes; global offsets come from CalculateRowOffsets().
  SetNLeftElems(node_in_set, range.begin(), n_left);
  SetNRightElems(node_in_set, range.begin(), n_right);
}
|
||||
|
||||
/**
 * @brief When data is split by column, we don't have all the features locally on the current
 * worker, so we go through all the rows and mark the bit vectors on whether the decision is made
 * to go right, or if the feature value used for the split is missing.
 *
 * @param decision_bits Set for a row when its decision is "go left".
 * @param missing_bits  Set for a row when the split feature value is missing locally.
 */
template <typename ExpandEntry>
void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
              const common::Range1d range, GHistIndexMatrix const& gmat,
              const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
              BitVector* decision_bits, BitVector* missing_bits) {
  // Rows assigned to this task.
  common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
  std::size_t nid = nodes[node_in_set].nid;
  bst_feature_t fid = tree[nid].SplitIndex();
  bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
  auto node_cats = tree.NodeCats(nid);
  auto const& cut_values = gmat.cut.Values();

  if (!column_matrix.IsInitialized()) {
    for (auto row_id : rid_span) {
      auto gidx = gmat.GetGindex(row_id, fid);
      if (gidx > -1) {
        // Feature value present locally: record the split decision.
        bool go_left = false;
        if (is_cat) {
          go_left = Decision(node_cats, cut_values[gidx]);
        } else {
          go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
        }
        if (go_left) {
          // Bits are indexed relative to the first row of this matrix.
          decision_bits->Set(row_id - gmat.base_rowid);
        }
      } else {
        // Feature value is missing on this worker.
        missing_bits->Set(row_id - gmat.base_rowid);
      }
    }
  } else {
    LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
  }
}
|
||||
|
||||
/**
 * @brief Once we've aggregated the decision and missing bits from all the workers, we can then
 * use them to partition the rows accordingly.
 *
 * @param decision_bits Aggregated "go left" decisions from all workers.
 * @param missing_bits  Aggregated "value missing everywhere" flags.
 */
template <typename ExpandEntry>
void PartitionByMask(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
                     const common::Range1d range, GHistIndexMatrix const& gmat,
                     const common::ColumnMatrix& column_matrix, const RegTree& tree,
                     const size_t* rid, BitVector const& decision_bits,
                     BitVector const& missing_bits) {
  common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
  common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
  common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
  std::size_t nid = nodes[node_in_set].nid;
  bool default_left = tree[nid].DefaultLeft();

  // A row goes left when the aggregated decision bit says so; rows that are
  // missing on every worker follow the node's default direction.
  auto pred_approx = [&](auto ridx) {
    bool go_left = default_left;
    bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
    if (!is_missing) {
      go_left = decision_bits.Check(ridx - gmat.base_rowid);
    }
    return go_left;
  };

  std::pair<size_t, size_t> child_nodes_sizes;
  if (!column_matrix.IsInitialized()) {
    child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
  } else {
    LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
  }

  const size_t n_left = child_nodes_sizes.first;
  const size_t n_right = child_nodes_sizes.second;

  SetNLeftElems(node_in_set, range.begin(), n_left);
  SetNRightElems(node_in_set, range.begin(), n_right);
}
|
||||
|
||||
// allocate thread local memory, should be called for each specific task
|
||||
void AllocateForTask(size_t id) {
|
||||
if (mem_blocks_[id].get() == nullptr) {
|
||||
BlockInfo* local_block_ptr = new BlockInfo;
|
||||
CHECK_NE(local_block_ptr, (BlockInfo*)nullptr);
|
||||
mem_blocks_[id].reset(local_block_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
// Span over the left-partition scratch buffer owned by the task that covers
// rows [begin, end) of node `nid`.
common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
  auto* block = mem_blocks_.at(GetTaskIdx(nid, begin)).get();
  return {block->Left(), end - begin};
}
|
||||
|
||||
// Span over the right-partition scratch buffer owned by the task that covers
// rows [begin, end) of node `nid`.
common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
  auto* block = mem_blocks_.at(GetTaskIdx(nid, begin)).get();
  return {block->Right(), end - begin};
}
|
||||
|
||||
void SetNLeftElems(int nid, size_t begin, size_t n_left) {
|
||||
size_t task_idx = GetTaskIdx(nid, begin);
|
||||
mem_blocks_.at(task_idx)->n_left = n_left;
|
||||
}
|
||||
|
||||
void SetNRightElems(int nid, size_t begin, size_t n_right) {
|
||||
size_t task_idx = GetTaskIdx(nid, begin);
|
||||
mem_blocks_.at(task_idx)->n_right = n_right;
|
||||
}
|
||||
|
||||
|
||||
[[nodiscard]] std::size_t GetNLeftElems(int nid) const {
|
||||
return left_right_nodes_sizes_[nid].first;
|
||||
}
|
||||
|
||||
[[nodiscard]] std::size_t GetNRightElems(int nid) const {
|
||||
return left_right_nodes_sizes_[nid].second;
|
||||
}
|
||||
|
||||
// Each thread has partial results for some set of tree-nodes
// The function decides order of merging partial results into final row set
void CalculateRowOffsets() {
  // One iteration per node; blocks_offsets_[i]..blocks_offsets_[i+1] are the
  // task blocks belonging to node i.
  for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
    // Exclusive prefix sum of per-block left counts -> write offset of each
    // block's left rows.
    size_t n_left = 0;
    for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
      mem_blocks_[j]->n_offset_left = n_left;
      n_left += mem_blocks_[j]->n_left;
    }
    // Right rows are placed after all left rows, hence the `n_left +` base.
    size_t n_right = 0;
    for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
      mem_blocks_[j]->n_offset_right = n_left + n_right;
      n_right += mem_blocks_[j]->n_right;
    }
    // Totals consumed by GetNLeftElems()/GetNRightElems().
    left_right_nodes_sizes_[i] = {n_left, n_right};
  }
}
|
||||
|
||||
// Copy one task's partial left/right partitions into the final row index array
// at the offsets computed by CalculateRowOffsets().
void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
  size_t task_idx = GetTaskIdx(nid, begin);

  // Destination positions inside the node's row set.
  size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left;
  size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right;

  const size_t* left = mem_blocks_[task_idx]->Left();
  const size_t* right = mem_blocks_[task_idx]->Right();

  std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result);
  std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result);
}
|
||||
|
||||
size_t GetTaskIdx(int nid, size_t begin) {
|
||||
return blocks_offsets_[nid] + begin / BlockSize;
|
||||
}
|
||||
|
||||
// Copy row partitions into global cache for reuse in objective
//
// For every leaf, writes the leaf's node id into h_pos for each of its rows.
// Rows excluded by sampling get the bit-complemented node id; rows belonging
// to no valid node keep the max() sentinel.
template <typename Sampledp>
void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set,
                   std::vector<bst_node_t>* p_position, Sampledp sampledp) const {
  auto& h_pos = *p_position;
  // Sentinel marks rows that are never assigned below.
  h_pos.resize(row_set.Data()->size(), std::numeric_limits<bst_node_t>::max());

  auto p_begin = row_set.Data()->data();
  ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) {
    auto const& node = row_set[i];
    if (node.node_id < 0) {
      return;  // invalid node: skip
    }
    CHECK(tree.IsLeaf(node.node_id));
    if (node.begin) {  // guard for empty node.
      size_t ptr_offset = node.end - p_begin;
      CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
      for (auto idx = node.begin; idx != node.end; ++idx) {
        // ~node_id (negative) flags a sampled-out row while preserving the id.
        h_pos[*idx] = sampledp(*idx) ? ~node.node_id : node.node_id;
      }
    }
  });
}
|
||||
|
||||
protected:
|
||||
struct BlockInfo{
|
||||
size_t n_left;
|
||||
size_t n_right;
|
||||
|
||||
size_t n_offset_left;
|
||||
size_t n_offset_right;
|
||||
|
||||
size_t* Left() {
|
||||
return &left_data_[0];
|
||||
}
|
||||
|
||||
size_t* Right() {
|
||||
return &right_data_[0];
|
||||
}
|
||||
private:
|
||||
size_t left_data_[BlockSize];
|
||||
size_t right_data_[BlockSize];
|
||||
};
|
||||
std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
|
||||
std::vector<size_t> blocks_offsets_;
|
||||
std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
|
||||
size_t max_n_tasks_ = 0;
|
||||
};
|
||||
} // namespace xgboost::common
|
||||
#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_
|
||||
|
||||
@@ -359,6 +359,7 @@ void AddCutPoint(typename SketchType::SummaryContainer const &summary, int max_b
|
||||
HistogramCuts *cuts) {
|
||||
size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
|
||||
auto &cut_values = cuts->cut_values_.HostVector();
|
||||
// we use the min_value as the first (0th) element, hence starting from 1.
|
||||
for (size_t i = 1; i < required_cuts; ++i) {
|
||||
bst_float cpt = summary.data[i].value;
|
||||
if (i == 1 || cpt > cut_values.back()) {
|
||||
@@ -419,8 +420,8 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
|
||||
} else {
|
||||
AddCutPoint<WQSketch>(a, max_num_bins, cuts);
|
||||
// push a value that is greater than anything
|
||||
const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value
|
||||
: cuts->min_vals_.HostVector()[fid];
|
||||
const bst_float cpt =
|
||||
(a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
|
||||
// this must be bigger than last value in a scale
|
||||
const bst_float last = cpt + (fabs(cpt) + 1e-5f);
|
||||
cuts->cut_values_.HostVector().push_back(last);
|
||||
|
||||
@@ -352,19 +352,6 @@ struct WQSummary {
|
||||
prev_rmax = data[i].rmax;
|
||||
}
|
||||
}
|
||||
// check consistency of the summary
// Returns true when every entry satisfies rmin + wmin <= rmax (up to a fixed
// tolerance) and both ranks are non-negative; otherwise logs and prints the
// summary and returns false.
// NOTE(review): `msg` is accepted but not used in the visible body — confirm
// whether it should be included in the log output.
inline bool Check(const char *msg) const {
  // Tolerance for accumulated floating point error in the rank statistics.
  const float tol = 10.0f;
  for (size_t i = 0; i < this->size; ++i) {
    if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
        data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
      LOG(INFO) << "---------- WQSummary::Check did not pass ----------";
      this->Print();
      return false;
    }
  }
  return true;
}
|
||||
};
|
||||
|
||||
/*! \brief try to do efficient pruning */
|
||||
|
||||
@@ -6,9 +6,7 @@
|
||||
#include <algorithm> // for copy_n, max, min, none_of, all_of
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdio> // for sscanf
|
||||
#include <exception> // for exception
|
||||
#include <functional> // for greater
|
||||
#include <iterator> // for reverse_iterator
|
||||
#include <string> // for char_traits, string
|
||||
|
||||
#include "algorithm.h" // for ArgSort
|
||||
@@ -18,12 +16,113 @@
|
||||
#include "xgboost/base.h" // for bst_group_t
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/data.h" // for MetaInfo
|
||||
#include "xgboost/linalg.h" // for All, TensorView, Range, Tensor, Vector
|
||||
#include "xgboost/logging.h" // for Error, LogCheck_EQ, CHECK_EQ
|
||||
#include "xgboost/linalg.h" // for All, TensorView, Range
|
||||
#include "xgboost/logging.h" // for CHECK_EQ
|
||||
|
||||
namespace xgboost::ltr {
|
||||
// Build the CPU-side ranking cache: query group pointer, maximum group size,
// and the group weight normalization factor.
void RankingCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
  // Without explicit query groups, treat the whole dataset as one group.
  if (info.group_ptr_.empty()) {
    group_ptr_.Resize(2, 0);
    group_ptr_.HostVector()[1] = info.num_row_;
  } else {
    group_ptr_.HostVector() = info.group_ptr_;
  }

  // Track the largest group; used to size per-position buffers.
  auto const& gptr = group_ptr_.ConstHostVector();
  for (std::size_t i = 1; i < gptr.size(); ++i) {
    std::size_t n = gptr[i] - gptr[i - 1];
    max_group_size_ = std::max(max_group_size_, n);
  }

  // Normalization factor so that group weights sum to the number of groups.
  double sum_weights = 0;
  auto n_groups = Groups();
  auto weight = common::MakeOptionalWeights(ctx, info.weights_);
  for (bst_omp_uint k = 0; k < n_groups; ++k) {
    sum_weights += weight[k];
  }
  weight_norm_ = static_cast<double>(n_groups) / sum_weights;
}
|
||||
|
||||
// Compute the model's rank list on CPU: for each query group, the indices of
// predictions sorted in descending order. Results go into the cached buffer.
common::Span<std::size_t const> RankingCache::MakeRankOnCPU(Context const* ctx,
                                                            common::Span<float const> predt) {
  auto gptr = this->DataGroupPtr(ctx);
  auto rank = this->sorted_idx_cache_.HostSpan();
  CHECK_EQ(rank.size(), predt.size());

  // Argsort each group independently; groups are disjoint so this is safely
  // parallel.
  common::ParallelFor(this->Groups(), ctx->Threads(), [&](auto g) {
    auto cnt = gptr[g + 1] - gptr[g];
    auto g_predt = predt.subspan(gptr[g], cnt);
    auto g_rank = rank.subspan(gptr[g], cnt);
    // std::greater -> descending by prediction score.
    auto sorted_idx = common::ArgSort<std::size_t>(
        ctx, g_predt.data(), g_predt.data() + g_predt.size(), std::greater<>{});
    CHECK_EQ(g_rank.size(), sorted_idx.size());
    std::copy_n(sorted_idx.data(), sorted_idx.size(), g_rank.data());
  });

  return rank;
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
// CPU-only build: CUDA entry points abort with a "GPU support" error.
void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*,
                                                             common::Span<float const>) {
  common::AssertGPUSupport();
  return {};
}
#endif  // !defined(XGBOOST_USE_CUDA)
|
||||
|
||||
// Build the CPU-side NDCG cache: positional discount table and the per-group
// inverse ideal DCG (IDCG).
void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
  auto const h_group_ptr = this->DataGroupPtr(ctx);

  // Pre-compute positional discounts, one per rank position.
  discounts_.Resize(MaxGroupSize(), 0);
  auto& h_discounts = discounts_.HostVector();
  for (std::size_t i = 0; i < MaxGroupSize(); ++i) {
    h_discounts[i] = CalcDCGDiscount(i);
  }

  auto n_groups = h_group_ptr.size() - 1;
  auto h_labels = info.labels.HostView().Slice(linalg::All(), 0);

  // Validate that no label violates the NDCG constraints.
  CheckNDCGLabels(this->Param(), h_labels,
                  [](auto beg, auto end, auto op) { return std::none_of(beg, end, op); });

  inv_idcg_.Reshape(n_groups);
  auto h_inv_idcg = inv_idcg_.HostView();
  std::size_t topk = this->Param().TopK();
  auto const exp_gain = this->Param().ndcg_exp_gain;

  // IDCG per group: labels sorted in decreasing order, gain * discount summed
  // over the top-k positions, then inverted (0 stays 0).
  common::ParallelFor(n_groups, ctx->Threads(), [&](auto g) {
    auto g_labels = h_labels.Slice(linalg::Range(h_group_ptr[g], h_group_ptr[g + 1]));
    auto sorted_idx = common::ArgSort<std::size_t>(ctx, linalg::cbegin(g_labels),
                                                   linalg::cend(g_labels), std::greater<>{});

    double idcg{0.0};
    for (std::size_t i = 0; i < std::min(g_labels.Size(), topk); ++i) {
      if (exp_gain) {
        idcg += h_discounts[i] * CalcDCGGain(g_labels(sorted_idx[i]));
      } else {
        idcg += h_discounts[i] * g_labels(sorted_idx[i]);
      }
    }
    h_inv_idcg(g) = CalcInvIDCG(idcg);
  });
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
// CPU-only build stub.
void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
#endif  // !defined(XGBOOST_USE_CUDA)
|
||||
|
||||
DMLC_REGISTER_PARAMETER(LambdaRankParam);
|
||||
|
||||
// Validate MAP labels on the host; the cache itself keeps no extra CPU state.
void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {
  auto const& labels = info.labels.HostView().Slice(linalg::All(), 0);
  CheckMapLabels(labels,
                 [](auto first, auto last, auto pred) { return std::all_of(first, last, pred); });
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
// CPU-only build stub.
void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
#endif  // !defined(XGBOOST_USE_CUDA)
|
||||
|
||||
std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) {
|
||||
std::string out_name;
|
||||
if (!param.empty()) {
|
||||
|
||||
212
src/common/ranking_utils.cu
Normal file
212
src/common/ranking_utils.cu
Normal file
@@ -0,0 +1,212 @@
|
||||
/**
|
||||
* Copyright 2023 by XGBoost Contributors
|
||||
*/
|
||||
#include <thrust/functional.h> // for maximum
|
||||
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
|
||||
#include <thrust/logical.h> // for none_of, all_of
|
||||
#include <thrust/pair.h> // for pair, make_pair
|
||||
#include <thrust/reduce.h> // for reduce
|
||||
#include <thrust/scan.h> // for inclusive_scan
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
|
||||
#include "algorithm.cuh" // for SegmentedArgSort
|
||||
#include "cuda_context.cuh" // for CUDAContext
|
||||
#include "device_helpers.cuh" // for MakeTransformIterator, LaunchN
|
||||
#include "optional_weight.h" // for MakeOptionalWeights, OptionalWeights
|
||||
#include "ranking_utils.cuh" // for ThreadsForMean
|
||||
#include "ranking_utils.h"
|
||||
#include "threading_utils.cuh" // for SegmentedTrapezoidThreads
|
||||
#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/linalg.h" // for VectorView, All, Range
|
||||
#include "xgboost/logging.h" // for CHECK
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::ltr {
|
||||
namespace cuda_impl {
|
||||
// Compute the (truncated) DCG of every query group on device: for each group,
// sum gain(label at rank i) * discount(i) over the top-k ranked positions.
void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
                    common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
                    common::Span<bst_group_t const> d_group_ptr, std::size_t k,
                    linalg::VectorView<double> out_dcg) {
  // One output value per group.
  CHECK_EQ(d_group_ptr.size() - 1, out_dcg.Size());
  using IdxGroup = thrust::pair<std::size_t, std::size_t>;
  // Pair each global element index with the group it falls into.
  auto group_it = dh::MakeTransformIterator<IdxGroup>(
      thrust::make_counting_iterator(0ull), [=] XGBOOST_DEVICE(std::size_t idx) {
        return thrust::make_pair(idx, dh::SegmentId(d_group_ptr, idx));  // NOLINT
      });
  // Per-element contribution: gain * positional discount, 0 past the top-k.
  auto value_it = dh::MakeTransformIterator<double>(
      group_it,
      [exp_gain, d_labels, d_group_ptr, k,
       d_sorted_idx] XGBOOST_DEVICE(IdxGroup const& l) -> double {
        auto g_begin = d_group_ptr[l.second];
        auto g_size = d_group_ptr[l.second + 1] - g_begin;

        auto idx_in_group = l.first - g_begin;
        if (idx_in_group >= k) {
          return 0.0;  // truncated: beyond top-k contributes nothing
        }
        double gain{0.0};
        auto g_sorted_idx = d_sorted_idx.subspan(g_begin, g_size);
        auto g_labels = d_labels.Slice(linalg::Range(g_begin, g_begin + g_size));

        // Exponential gain (2^label - 1) or the raw label.
        if (exp_gain) {
          gain = ltr::CalcDCGGain(g_labels(g_sorted_idx[idx_in_group]));
        } else {
          gain = g_labels(g_sorted_idx[idx_in_group]);
        }
        double discount = CalcDCGDiscount(idx_in_group);
        return gain * discount;
      });

  CHECK(out_dcg.Contiguous());
  // Segmented sum over groups. The first call with a nullptr workspace only
  // queries the required temporary storage size.
  std::size_t bytes;
  cub::DeviceSegmentedReduce::Sum(nullptr, bytes, value_it, out_dcg.Values().data(),
                                  d_group_ptr.size() - 1, d_group_ptr.data(),
                                  d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
  dh::TemporaryArray<char> temp(bytes);
  cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, value_it, out_dcg.Values().data(),
                                  d_group_ptr.size() - 1, d_group_ptr.data(),
                                  d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
}
|
||||
|
||||
// Compute the inverse ideal DCG (1/IDCG, with 0 for all-irrelevant groups)
// for every query group on device.
void CalcQueriesInvIDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
                        common::Span<bst_group_t const> d_group_ptr,
                        linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const& p) {
  CHECK_GE(d_group_ptr.size(), 2ul);
  size_t n_groups = d_group_ptr.size() - 1;
  CHECK_EQ(out_inv_IDCG.Size(), n_groups);
  // The ideal ordering: labels sorted in decreasing order within each group.
  dh::device_vector<std::size_t> sorted_idx(d_labels.Size());
  auto d_sorted_idx = dh::ToSpan(sorted_idx);
  common::SegmentedArgSort<false, true>(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx);
  CalcQueriesDCG(ctx, d_labels, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(), out_inv_IDCG);
  // Invert the DCG values in place; CalcInvIDCG maps 0 to 0.
  dh::LaunchN(out_inv_IDCG.Size(), ctx->CUDACtx()->Stream(),
              [out_inv_IDCG] XGBOOST_DEVICE(size_t idx) mutable {
                double idcg = out_inv_IDCG(idx);
                out_inv_IDCG(idx) = CalcInvIDCG(idcg);
              });
}
|
||||
} // namespace cuda_impl
|
||||
|
||||
namespace {
// Thrust-backed predicate runner used by NDCG label validation: labels are
// valid when none of them triggers `op`.
struct CheckNDCGOp {
  CUDAContext const* cuctx;
  template <typename It, typename Op>
  bool operator()(It beg, It end, Op op) {
    return thrust::none_of(cuctx->CTP(), beg, end, op);
  }
};
// MAP counterpart: labels are valid when all of them satisfy `op`.
struct CheckMAPOp {
  CUDAContext const* cuctx;
  template <typename It, typename Op>
  bool operator()(It beg, It end, Op op) {
    return thrust::all_of(cuctx->CTP(), beg, end, op);
  }
};

// Writes the per-group CUDA thread count at position i + 1; the caller
// prefix-sums the result to obtain thread offsets.
struct ThreadGroupOp {
  common::Span<bst_group_t const> d_group_ptr;
  std::size_t n_pairs;

  common::Span<std::size_t> out_thread_group_ptr;

  XGBOOST_DEVICE void operator()(std::size_t i) {
    out_thread_group_ptr[i + 1] =
        cuda_impl::ThreadsForMean(d_group_ptr[i + 1] - d_group_ptr[i], n_pairs);
  }
};

// Size of the i-th query group.
struct GroupSizeOp {
  common::Span<bst_group_t const> d_group_ptr;

  XGBOOST_DEVICE auto operator()(std::size_t i) -> std::size_t {
    return d_group_ptr[i + 1] - d_group_ptr[i];
  }
};

// Reads one (optional) sample weight as a double for reduction.
struct WeightOp {
  common::OptionalWeights d_weight;
  XGBOOST_DEVICE auto operator()(std::size_t i) -> double { return d_weight[i]; }
};
}  // anonymous namespace
|
||||
|
||||
// Build the device-side ranking cache: group pointer, max group size, the
// thread-to-group assignment for gradient kernels, and the weight norm.
void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
  CUDAContext const* cuctx = ctx->CUDACtx();

  group_ptr_.SetDevice(ctx->gpu_id);
  // Without explicit query groups, treat the whole dataset as one group.
  if (info.group_ptr_.empty()) {
    group_ptr_.Resize(2, 0);
    group_ptr_.HostVector()[1] = info.num_row_;
  } else {
    auto const& h_group_ptr = info.group_ptr_;
    group_ptr_.Resize(h_group_ptr.size());
    auto d_group_ptr = group_ptr_.DeviceSpan();
    dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
                                  cudaMemcpyHostToDevice, cuctx->Stream()));
  }

  auto d_group_ptr = DataGroupPtr(ctx);
  std::size_t n_groups = Groups();

  // Largest group size, computed via a device-side max reduction.
  auto it = dh::MakeTransformIterator<std::size_t>(thrust::make_counting_iterator(0ul),
                                                   GroupSizeOp{d_group_ptr});
  max_group_size_ =
      thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum<std::size_t>{});

  // Assign CUDA threads to groups for the gradient computation.
  threads_group_ptr_.SetDevice(ctx->gpu_id);
  threads_group_ptr_.Resize(n_groups + 1, 0);
  auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan();
  if (param_.HasTruncation()) {
    n_cuda_threads_ =
        common::SegmentedTrapezoidThreads(d_group_ptr, d_threads_group_ptr, Param().NumPair());
  } else {
    // Mean method: per-group thread counts, prefix-summed into offsets.
    auto n_pairs = Param().NumPair();
    dh::LaunchN(n_groups, cuctx->Stream(),
                ThreadGroupOp{d_group_ptr, n_pairs, d_threads_group_ptr});
    thrust::inclusive_scan(cuctx->CTP(), dh::tcbegin(d_threads_group_ptr),
                           dh::tcend(d_threads_group_ptr), dh::tbegin(d_threads_group_ptr));
    n_cuda_threads_ = info.num_row_ * param_.NumPair();
  }

  sorted_idx_cache_.SetDevice(ctx->gpu_id);
  sorted_idx_cache_.Resize(info.labels.Size(), 0);

  // Normalization factor so that group weights sum to the number of groups.
  auto weight = common::MakeOptionalWeights(ctx, info.weights_);
  auto w_it =
      dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), WeightOp{weight});
  weight_norm_ = static_cast<double>(n_groups) / thrust::reduce(w_it, w_it + n_groups);
}
|
||||
|
||||
// Compute the model's rank list on device: per-group argsort of predictions in
// descending order, written into the cached index buffer.
common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const* ctx,
                                                             common::Span<float const> predt) {
  auto group_ptr = DataGroupPtr(ctx);
  auto sorted_idx = sorted_idx_cache_.DeviceSpan();
  common::SegmentedArgSort<false, true>(ctx, predt, group_ptr, sorted_idx);
  return sorted_idx;
}
|
||||
|
||||
// Build the device-side NDCG cache: validate labels, compute per-group inverse
// ideal DCG, and fill the positional discount table.
void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
  CUDAContext const* cuctx = ctx->CUDACtx();
  auto labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
  CheckNDCGLabels(this->Param(), labels, CheckNDCGOp{cuctx});

  auto d_group_ptr = this->DataGroupPtr(ctx);

  // One inverse IDCG value per query group.
  std::size_t n_groups = d_group_ptr.size() - 1;
  inv_idcg_ = linalg::Zeros<double>(ctx, n_groups);
  auto d_inv_idcg = inv_idcg_.View(ctx->gpu_id);
  cuda_impl::CalcQueriesInvIDCG(ctx, labels, d_group_ptr, d_inv_idcg, this->Param());
  CHECK_GE(this->Param().NumPair(), 1ul);

  // Positional discount table, filled on device.
  discounts_.SetDevice(ctx->gpu_id);
  discounts_.Resize(MaxGroupSize());
  auto d_discount = discounts_.DeviceSpan();
  dh::LaunchN(MaxGroupSize(), cuctx->Stream(),
              [=] XGBOOST_DEVICE(std::size_t i) { d_discount[i] = CalcDCGDiscount(i); });
}
|
||||
|
||||
// Validate MAP labels on device; the cache itself keeps no extra GPU state.
void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
  auto const labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
  CheckMapLabels(labels, CheckMAPOp{ctx->CUDACtx()});
}
|
||||
} // namespace xgboost::ltr
|
||||
40
src/common/ranking_utils.cuh
Normal file
40
src/common/ranking_utils.cuh
Normal file
@@ -0,0 +1,40 @@
|
||||
/**
|
||||
* Copyright 2023 by XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_RANKING_UTILS_CUH_
|
||||
#define XGBOOST_COMMON_RANKING_UTILS_CUH_
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
|
||||
#include "ranking_utils.h" // for LambdaRankParam
|
||||
#include "xgboost/base.h" // for bst_group_t, XGBOOST_DEVICE
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/linalg.h" // for VectorView
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost {
|
||||
namespace ltr {
|
||||
namespace cuda_impl {
|
||||
void CalcQueriesDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
|
||||
common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
|
||||
common::Span<bst_group_t const> d_group_ptr, std::size_t k,
|
||||
linalg::VectorView<double> out_dcg);
|
||||
|
||||
void CalcQueriesInvIDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
|
||||
common::Span<bst_group_t const> d_group_ptr,
|
||||
linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const &p);
|
||||
|
||||
// Functions for creating number of threads for CUDA, and getting back the number of pairs
// from the number of threads.
//
// One thread per (sample, pair) combination within a group.
XGBOOST_DEVICE __forceinline__ std::size_t ThreadsForMean(std::size_t group_size,
                                                          std::size_t n_pairs) {
  return group_size * n_pairs;
}
|
||||
// Inverse of ThreadsForMean: recover the pair count from a group's thread count.
XGBOOST_DEVICE __forceinline__ std::size_t PairsForGroup(std::size_t n_threads,
                                                         std::size_t group_size) {
  return n_threads / group_size;
}
|
||||
} // namespace cuda_impl
|
||||
} // namespace ltr
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_RANKING_UTILS_CUH_
|
||||
@@ -11,7 +11,6 @@
|
||||
#include <string> // for char_traits, string
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "./math.h" // for CloseTo
|
||||
#include "dmlc/parameter.h" // for FieldEntry, DMLC_DECLARE_FIELD
|
||||
#include "error_msg.h" // for GroupWeight, GroupSize
|
||||
#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t
|
||||
@@ -19,7 +18,7 @@
|
||||
#include "xgboost/data.h" // for MetaInfo
|
||||
#include "xgboost/host_device_vector.h" // for HostDeviceVector
|
||||
#include "xgboost/linalg.h" // for Vector, VectorView, Tensor
|
||||
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK
|
||||
#include "xgboost/logging.h" // for CHECK_EQ, CHECK
|
||||
#include "xgboost/parameter.h" // for XGBoostParameter
|
||||
#include "xgboost/span.h" // for Span
|
||||
#include "xgboost/string_view.h" // for StringView
|
||||
@@ -34,6 +33,25 @@ using rel_degree_t = std::uint32_t; // NOLINT
|
||||
*/
|
||||
using position_t = std::uint32_t; // NOLINT
|
||||
|
||||
/**
 * \brief Maximum relevance degree for NDCG
 */
constexpr std::size_t MaxRel() { return sizeof(rel_degree_t) * 8 - 1; }  // 31 for a 32-bit degree
static_assert(MaxRel() == 31);
|
||||
|
||||
// Exponential NDCG gain: 2^label - 1.
XGBOOST_DEVICE inline double CalcDCGGain(rel_degree_t label) {
  return static_cast<double>((1u << label) - 1);
}
|
||||
|
||||
// Positional DCG discount: 1 / log2(idx + 2), with idx being the 0-based rank.
XGBOOST_DEVICE inline double CalcDCGDiscount(std::size_t idx) {
  return 1.0 / std::log2(static_cast<double>(idx) + 2.0);
}
|
||||
|
||||
// Inverse ideal DCG. A query with only irrelevant documents has IDCG == 0;
// its inverse is defined as 0 so the NDCG contribution vanishes instead of
// dividing by zero.
XGBOOST_DEVICE inline double CalcInvIDCG(double idcg) {
  return idcg == 0.0 ? 0.0 : 1.0 / idcg;
}
|
||||
|
||||
enum class PairMethod : std::int32_t {
|
||||
kTopK = 0,
|
||||
kMean = 1,
|
||||
@@ -115,7 +133,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
|
||||
.describe("Number of pairs for each sample in the list.");
|
||||
DMLC_DECLARE_FIELD(lambdarank_unbiased)
|
||||
.set_default(false)
|
||||
.describe("Unbiased lambda mart. Use IPW to debias click position");
|
||||
.describe("Unbiased lambda mart. Use extended IPW to debias click position");
|
||||
DMLC_DECLARE_FIELD(lambdarank_bias_norm)
|
||||
.set_default(2.0)
|
||||
.set_lower_bound(0.0)
|
||||
@@ -126,6 +144,285 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief Common cached items for ranking tasks.
|
||||
*/
|
||||
class RankingCache {
|
||||
private:
|
||||
void InitOnCPU(Context const* ctx, MetaInfo const& info);
|
||||
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
|
||||
// Cached parameter
|
||||
LambdaRankParam param_;
|
||||
// offset to data groups.
|
||||
HostDeviceVector<bst_group_t> group_ptr_;
|
||||
// store the sorted index of prediction.
|
||||
HostDeviceVector<std::size_t> sorted_idx_cache_;
|
||||
// Maximum size of group
|
||||
std::size_t max_group_size_{0};
|
||||
// Normalization for weight
|
||||
double weight_norm_{1.0};
|
||||
/**
|
||||
* CUDA cache
|
||||
*/
|
||||
// offset to threads assigned to each group for gradient calculation
|
||||
HostDeviceVector<std::size_t> threads_group_ptr_;
|
||||
// Sorted index of label for finding buckets.
|
||||
HostDeviceVector<std::size_t> y_sorted_idx_cache_;
|
||||
// Cached labels sorted by the model
|
||||
HostDeviceVector<float> y_ranked_by_model_;
|
||||
// store rounding factor for objective for each group
|
||||
linalg::Vector<GradientPair> roundings_;
|
||||
// rounding factor for cost
|
||||
HostDeviceVector<double> cost_rounding_;
|
||||
// temporary storage for creating rounding factors. Stored as byte to avoid having cuda
|
||||
// data structure in here.
|
||||
HostDeviceVector<std::uint8_t> max_lambdas_;
|
||||
// total number of cuda threads used for gradient calculation
|
||||
std::size_t n_cuda_threads_{0};
|
||||
|
||||
// Create model rank list on GPU
|
||||
common::Span<std::size_t const> MakeRankOnCUDA(Context const* ctx,
|
||||
common::Span<float const> predt);
|
||||
// Create model rank list on CPU
|
||||
common::Span<std::size_t const> MakeRankOnCPU(Context const* ctx,
|
||||
common::Span<float const> predt);
|
||||
|
||||
protected:
|
||||
[[nodiscard]] std::size_t MaxGroupSize() const { return max_group_size_; }
|
||||
|
||||
public:
|
||||
RankingCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : param_{p} {
|
||||
CHECK(param_.GetInitialised());
|
||||
if (!info.group_ptr_.empty()) {
|
||||
CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
|
||||
<< error::GroupSize() << "the size of label.";
|
||||
}
|
||||
if (ctx->IsCPU()) {
|
||||
this->InitOnCPU(ctx, info);
|
||||
} else {
|
||||
this->InitOnCUDA(ctx, info);
|
||||
}
|
||||
if (!info.weights_.Empty()) {
|
||||
CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
|
||||
}
|
||||
}
|
||||
[[nodiscard]] std::size_t MaxPositionSize() const {
|
||||
// Use truncation level as bound.
|
||||
if (param_.HasTruncation()) {
|
||||
return param_.NumPair();
|
||||
}
|
||||
// Hardcoded maximum size of positions to track. We don't need too many of them as the
|
||||
// bias decreases exponentially.
|
||||
return std::min(max_group_size_, static_cast<std::size_t>(32));
|
||||
}
|
||||
// Constructed as [1, n_samples] if group ptr is not supplied by the user
|
||||
common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
|
||||
group_ptr_.SetDevice(ctx->gpu_id);
|
||||
return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
|
||||
}
|
||||
|
||||
[[nodiscard]] auto const& Param() const { return param_; }
|
||||
[[nodiscard]] std::size_t Groups() const { return group_ptr_.Size() - 1; }
|
||||
[[nodiscard]] double WeightNorm() const { return weight_norm_; }
|
||||
|
||||
// Create a rank list by model prediction
|
||||
common::Span<std::size_t const> SortedIdx(Context const* ctx, common::Span<float const> predt) {
|
||||
if (sorted_idx_cache_.Empty()) {
|
||||
sorted_idx_cache_.SetDevice(ctx->gpu_id);
|
||||
sorted_idx_cache_.Resize(predt.size());
|
||||
}
|
||||
if (ctx->IsCPU()) {
|
||||
return this->MakeRankOnCPU(ctx, predt);
|
||||
} else {
|
||||
return this->MakeRankOnCUDA(ctx, predt);
|
||||
}
|
||||
}
|
||||
// The function simply returns a uninitialized buffer as this is only used by the
|
||||
// objective for creating pairs.
|
||||
common::Span<std::size_t> SortedIdxY(Context const* ctx, std::size_t n_samples) {
|
||||
CHECK(ctx->IsCUDA());
|
||||
if (y_sorted_idx_cache_.Empty()) {
|
||||
y_sorted_idx_cache_.SetDevice(ctx->gpu_id);
|
||||
y_sorted_idx_cache_.Resize(n_samples);
|
||||
}
|
||||
return y_sorted_idx_cache_.DeviceSpan();
|
||||
}
|
||||
// Buffer for labels sorted by model prediction; lazily allocated, CUDA-only.
common::Span<float> RankedY(Context const* ctx, std::size_t n_samples) {
  CHECK(ctx->IsCUDA());
  if (y_ranked_by_model_.Empty()) {
    y_ranked_by_model_.SetDevice(ctx->gpu_id);
    y_ranked_by_model_.Resize(n_samples);
  }
  return y_ranked_by_model_.DeviceSpan();
}
|
||||
|
||||
// CUDA cache getters, the cache is shared between metric and objective, some of these
|
||||
// fields are lazy initialized to avoid unnecessary allocation.
|
||||
[[nodiscard]] common::Span<std::size_t const> CUDAThreadsGroupPtr() const {
|
||||
CHECK(!threads_group_ptr_.Empty());
|
||||
return threads_group_ptr_.ConstDeviceSpan();
|
||||
}
|
||||
[[nodiscard]] std::size_t CUDAThreads() const { return n_cuda_threads_; }
|
||||
|
||||
// Per-group rounding factors for deterministic gradient accumulation; lazily allocated.
linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
  if (roundings_.Size() == 0) {
    roundings_.SetDevice(ctx->gpu_id);
    roundings_.Reshape(Groups());
  }
  return roundings_.View(ctx->gpu_id);
}
|
||||
// Single rounding factor for the cost; lazily allocated on first use.
common::Span<double> CUDACostRounding(Context const* ctx) {
  if (cost_rounding_.Size() == 0) {
    cost_rounding_.SetDevice(ctx->gpu_id);
    cost_rounding_.Resize(1);
  }
  return cost_rounding_.DeviceSpan();
}
|
||||
template <typename Type>
|
||||
common::Span<Type> MaxLambdas(Context const* ctx, std::size_t n) {
|
||||
max_lambdas_.SetDevice(ctx->gpu_id);
|
||||
std::size_t bytes = n * sizeof(Type);
|
||||
if (bytes != max_lambdas_.Size()) {
|
||||
max_lambdas_.Resize(bytes);
|
||||
}
|
||||
return common::Span<Type>{reinterpret_cast<Type*>(max_lambdas_.DevicePointer()), n};
|
||||
}
|
||||
};
|
||||
|
||||
class NDCGCache : public RankingCache {
|
||||
// NDCG discount
|
||||
HostDeviceVector<double> discounts_;
|
||||
// 1.0 / IDCG
|
||||
linalg::Vector<double> inv_idcg_;
|
||||
/**
|
||||
* CUDA cache
|
||||
*/
|
||||
// store the intermediate DCG calculation result for metric
|
||||
linalg::Vector<double> dcg_;
|
||||
|
||||
public:
|
||||
void InitOnCPU(Context const* ctx, MetaInfo const& info);
|
||||
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
|
||||
|
||||
public:
|
||||
NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
|
||||
: RankingCache{ctx, info, p} {
|
||||
if (ctx->IsCPU()) {
|
||||
this->InitOnCPU(ctx, info);
|
||||
} else {
|
||||
this->InitOnCUDA(ctx, info);
|
||||
}
|
||||
}
|
||||
|
||||
linalg::VectorView<double const> InvIDCG(Context const* ctx) const {
|
||||
return inv_idcg_.View(ctx->gpu_id);
|
||||
}
|
||||
common::Span<double const> Discount(Context const* ctx) const {
|
||||
return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
|
||||
}
|
||||
linalg::VectorView<double> Dcg(Context const* ctx) {
|
||||
if (dcg_.Size() == 0) {
|
||||
dcg_.SetDevice(ctx->gpu_id);
|
||||
dcg_.Reshape(this->Groups());
|
||||
}
|
||||
return dcg_.View(ctx->gpu_id);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief Validate label for NDCG
|
||||
*
|
||||
* \tparam NoneOf Implementation of std::none_of. Specified as a parameter to reuse the
|
||||
* check for both CPU and GPU.
|
||||
*/
|
||||
template <typename NoneOf>
|
||||
void CheckNDCGLabels(ltr::LambdaRankParam const& p, linalg::VectorView<float const> labels,
|
||||
NoneOf none_of) {
|
||||
auto d_labels = labels.Values();
|
||||
if (p.ndcg_exp_gain) {
|
||||
auto label_is_integer =
|
||||
none_of(d_labels.data(), d_labels.data() + d_labels.size(), [] XGBOOST_DEVICE(float v) {
|
||||
auto l = std::floor(v);
|
||||
return std::fabs(l - v) > kRtEps || v < 0.0f;
|
||||
});
|
||||
CHECK(label_is_integer)
|
||||
<< "When using relevance degree as target, label must be either 0 or positive integer.";
|
||||
}
|
||||
|
||||
if (p.ndcg_exp_gain) {
|
||||
auto label_is_valid = none_of(d_labels.data(), d_labels.data() + d_labels.size(),
|
||||
[] XGBOOST_DEVICE(ltr::rel_degree_t v) { return v > MaxRel(); });
|
||||
CHECK(label_is_valid) << "Relevance degress must be lesser than or equal to " << MaxRel()
|
||||
<< " when the exponential NDCG gain function is used. "
|
||||
<< "Set `ndcg_exp_gain` to false to use custom DCG gain.";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename AllOf>
|
||||
bool IsBinaryRel(linalg::VectorView<float const> label, AllOf all_of) {
|
||||
auto s_label = label.Values();
|
||||
return all_of(s_label.data(), s_label.data() + s_label.size(), [] XGBOOST_DEVICE(float y) {
|
||||
return std::abs(y - 1.0f) < kRtEps || std::abs(y - 0.0f) < kRtEps;
|
||||
});
|
||||
}
|
||||
/**
|
||||
* \brief Validate label for MAP
|
||||
*
|
||||
* \tparam Implementation of std::all_of. Specified as a parameter to reuse the check for
|
||||
* both CPU and GPU.
|
||||
*/
|
||||
template <typename AllOf>
|
||||
void CheckMapLabels(linalg::VectorView<float const> label, AllOf all_of) {
|
||||
auto s_label = label.Values();
|
||||
auto is_binary = IsBinaryRel(label, all_of);
|
||||
CHECK(is_binary) << "MAP can only be used with binary labels.";
|
||||
}
|
||||
|
||||
class MAPCache : public RankingCache {
|
||||
// Total number of relevant documents for each group
|
||||
HostDeviceVector<double> n_rel_;
|
||||
// \sum l_k/k
|
||||
HostDeviceVector<double> acc_;
|
||||
HostDeviceVector<double> map_;
|
||||
// Number of samples in this dataset.
|
||||
std::size_t n_samples_{0};
|
||||
|
||||
void InitOnCPU(Context const* ctx, MetaInfo const& info);
|
||||
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
|
||||
|
||||
public:
|
||||
MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
|
||||
: RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
|
||||
if (ctx->IsCPU()) {
|
||||
this->InitOnCPU(ctx, info);
|
||||
} else {
|
||||
this->InitOnCUDA(ctx, info);
|
||||
}
|
||||
}
|
||||
|
||||
common::Span<double> NumRelevant(Context const* ctx) {
|
||||
if (n_rel_.Empty()) {
|
||||
n_rel_.SetDevice(ctx->gpu_id);
|
||||
n_rel_.Resize(n_samples_);
|
||||
}
|
||||
return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();
|
||||
}
|
||||
common::Span<double> Acc(Context const* ctx) {
|
||||
if (acc_.Empty()) {
|
||||
acc_.SetDevice(ctx->gpu_id);
|
||||
acc_.Resize(n_samples_);
|
||||
}
|
||||
return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();
|
||||
}
|
||||
common::Span<double> Map(Context const* ctx) {
|
||||
if (map_.Empty()) {
|
||||
map_.SetDevice(ctx->gpu_id);
|
||||
map_.Resize(this->Groups());
|
||||
}
|
||||
return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief Parse name for ranking metric given parameters.
|
||||
*
|
||||
|
||||
@@ -8,9 +8,11 @@
|
||||
#include <dmlc/omp.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint> // std::int32_t
|
||||
#include <cstdint> // for int32_t
|
||||
#include <cstdlib> // for malloc, free
|
||||
#include <limits>
|
||||
#include <type_traits> // std::is_signed
|
||||
#include <new> // for bad_alloc
|
||||
#include <type_traits> // for is_signed
|
||||
#include <vector>
|
||||
|
||||
#include "xgboost/logging.h"
|
||||
@@ -266,7 +268,7 @@ class MemStackAllocator {
|
||||
if (MaxStackSize >= required_size_) {
|
||||
ptr_ = stack_mem_;
|
||||
} else {
|
||||
ptr_ = reinterpret_cast<T*>(malloc(required_size_ * sizeof(T)));
|
||||
ptr_ = reinterpret_cast<T*>(std::malloc(required_size_ * sizeof(T)));
|
||||
}
|
||||
if (!ptr_) {
|
||||
throw std::bad_alloc{};
|
||||
@@ -278,7 +280,7 @@ class MemStackAllocator {
|
||||
|
||||
~MemStackAllocator() {
|
||||
if (required_size_ > MaxStackSize) {
|
||||
free(ptr_);
|
||||
std::free(ptr_);
|
||||
}
|
||||
}
|
||||
T& operator[](size_t i) { return ptr_[i]; }
|
||||
|
||||
Reference in New Issue
Block a user