initial merge

amdsc21
2023-03-25 04:31:55 +01:00
146 changed files with 6730 additions and 4082 deletions

View File

@@ -55,6 +55,7 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
*out_dim = 2;
shape.resize(*out_dim);
shape.front() = rows;
// chunksize can be 1 if it's softmax
shape.back() = std::min(groups, chunksize);
}
break;
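As an aside on the shape rule above, a minimal sketch (row and group counts are hypothetical) of how the two prediction types differ:

```cpp
#include <algorithm>  // for std::min
#include <cstddef>
#include <vector>

// shape = {rows, min(groups, chunksize)} per the hunk above.
std::vector<std::size_t> PredictShape(std::size_t rows, std::size_t groups,
                                      std::size_t chunksize) {
  return {rows, std::min(groups, chunksize)};
}
// rows = 100, groups = 3: softmax  -> chunksize = 1 -> shape {100, 1}
//                         softprob -> chunksize = 3 -> shape {100, 3}
```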

View File

@@ -14,7 +14,7 @@
// clang with libstdc++ works as well
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
!defined(__APPLE__) && __has_include(<omp.h>)
!defined(__APPLE__) && __has_include(<omp.h>) && __has_include(<parallel/algorithm>)
#define GCC_HAS_PARALLEL 1
#endif // GCC_HAS_PARALLEL
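For context, a minimal sketch of how such a feature macro is typically consumed; the call site here is a hypothetical example, not XGBoost's actual one. The newly added `__has_include(<parallel/algorithm>)` check guards exactly the include used below:

```cpp
#include <algorithm>
#include <vector>
#if defined(GCC_HAS_PARALLEL)
#include <parallel/algorithm>  // __gnu_parallel::stable_sort
#endif

void StableSort(std::vector<float> *p_vals) {
#if defined(GCC_HAS_PARALLEL)
  __gnu_parallel::stable_sort(p_vals->begin(), p_vals->end());  // GNU parallel mode
#else
  std::stable_sort(p_vals->begin(), p_vals->end());             // serial fallback
#endif
}
```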

View File

@@ -121,17 +121,20 @@ namespace dh {
#ifdef XGBOOST_USE_NCCL
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
int line) {
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
if (code != ncclSuccess) {
std::stringstream ss;
ss << "NCCL failure :" << ncclGetErrorString(code);
ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
ss << " " << file << "(" << line << ")\n";
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError();
ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
}
ss << " " << file << "(" << line << ")";
LOG(FATAL) << ss.str();
}
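Typical usage of the wrapper, as a hedged sketch; the buffers, communicator, and stream are assumed to be set up elsewhere:

```cpp
// Wrap every NCCL call so a failure surfaces with file/line context.
std::size_t count = 1024;                    // assumed element count
float *sendbuf{nullptr}, *recvbuf{nullptr};  // assumed device allocations
ncclComm_t comm;                             // assumed initialized communicator
cudaStream_t stream;                         // assumed created stream
safe_nccl(ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum, comm, stream));
```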

View File

@@ -2,6 +2,9 @@
* Copyright 2017-2023 XGBoost contributors
*/
#pragma once
#if defined(XGBOOST_USE_CUDA)
#include <thrust/binary_search.h> // thrust::upper_bound
#include <thrust/device_malloc_allocator.h>
#include <thrust/device_ptr.h>
@@ -95,20 +98,23 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) { // NOLINT
}
namespace dh {
#ifdef XGBOOST_USE_NCCL
#ifdef XGBOOST_USE_RCCL
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
int line) {
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
if (code != ncclSuccess) {
std::stringstream ss;
ss << "NCCL failure :" << ncclGetErrorString(code);
ss << "RCCL failure: " << ncclGetErrorString(code) << ".";
ss << " " << file << "(" << line << ")\n";
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = hipPeekAtLastError();
ss << " " << thrust::system_error(err, thrust::hip_category()).what();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
}
ss << " " << file << "(" << line << ")";
LOG(FATAL) << ss.str();
}

View File

@@ -20,5 +20,9 @@ constexpr StringView GroupSize() {
constexpr StringView LabelScoreSize() {
return "The size of label doesn't match the size of prediction.";
}
constexpr StringView InfInData() {
return "Input data contains `inf` or a value too large, while `missing` is not set to `inf`";
}
} // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_
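These helpers exist so every call site emits identical wording. A minimal usage sketch; the checked value and its accessor are hypothetical, while `CHECK` is dmlc's logging macro already used throughout the codebase:

```cpp
#include <cmath>  // for std::isinf

float v = GetFeatureValue();  // hypothetical accessor
CHECK(!std::isinf(v)) << error::InfInData();
```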

View File

@@ -7,23 +7,22 @@
#ifndef XGBOOST_COMMON_HIST_UTIL_H_
#define XGBOOST_COMMON_HIST_UTIL_H_
#include <xgboost/data.h>
#include <algorithm>
#include <cstdint> // for uint32_t
#include <limits>
#include <map>
#include <memory>
#include <utility>
#include <vector>
#include "algorithm.h" // SegmentId
#include "categorical.h"
#include "common.h"
#include "quantile.h"
#include "row_set.h"
#include "threading_utils.h"
#include "timer.h"
#include "xgboost/base.h" // bst_feature_t, bst_bin_t
#include "xgboost/base.h" // for bst_feature_t, bst_bin_t
#include "xgboost/data.h"
namespace xgboost {
class GHistIndexMatrix;
@@ -392,15 +391,18 @@ class HistCollection {
}
// have we computed a histogram for i-th node?
bool RowExists(bst_uint nid) const {
[[nodiscard]] bool RowExists(bst_uint nid) const {
const uint32_t k_max = std::numeric_limits<uint32_t>::max();
return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
}
// initialize histogram collection
void Init(uint32_t nbins) {
if (nbins_ != nbins) {
nbins_ = nbins;
/**
* \brief Initialize histogram collection.
*
* \param n_total_bins Number of bins across all features.
*/
void Init(std::uint32_t n_total_bins) {
if (nbins_ != n_total_bins) {
nbins_ = n_total_bins;
// quite expensive operation, so let's do this only once
data_.clear();
}
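A hedged sketch of the documented lifecycle: `Init` is keyed on the total bin count, so repeated calls with unchanged geometry skip the expensive clear, and `RowExists` guards per-node access. The allocation call is hypothetical since it is not shown in this hunk:

```cpp
HistCollection hist;
hist.Init(n_total_bins);  // clears data_ only when the bin count changes
if (!hist.RowExists(nid)) {
  hist.AddHistRow(nid);   // hypothetical allocation API
}
```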

View File

@@ -333,7 +333,7 @@ size_t constexpr JsonReader::kMaxNumLength;
Json JsonReader::Parse() {
while (true) {
SkipSpaces();
char c = PeekNextChar();
auto c = PeekNextChar();
if (c == -1) { break; }
if (c == '{') {
@@ -408,13 +408,13 @@ void JsonReader::Error(std::string msg) const {
}
namespace {
bool IsSpace(char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
bool IsSpace(JsonReader::Char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
} // anonymous namespace
// Json class
void JsonReader::SkipSpaces() {
while (cursor_.Pos() < raw_str_.size()) {
char c = raw_str_[cursor_.Pos()];
Char c = raw_str_[cursor_.Pos()];
if (IsSpace(c)) {
cursor_.Forward();
} else {
@@ -436,12 +436,12 @@ void ParseStr(std::string const& str) {
}
Json JsonReader::ParseString() {
char ch { GetConsecutiveChar('\"') }; // NOLINT
Char ch { GetConsecutiveChar('\"') }; // NOLINT
std::string str;
while (true) {
ch = GetNextChar();
if (ch == '\\') {
char next = static_cast<char>(GetNextChar());
Char next{GetNextChar()};
switch (next) {
case 'r': str += u8"\r"; break;
case 'n': str += u8"\n"; break;
@@ -466,8 +466,8 @@ Json JsonReader::ParseString() {
}
Json JsonReader::ParseNull() {
char ch = GetNextNonSpaceChar();
std::string buffer{ch};
Char ch = GetNextNonSpaceChar();
std::string buffer{static_cast<char>(ch)};
for (size_t i = 0; i < 3; ++i) {
buffer.push_back(GetNextChar());
}
@@ -480,7 +480,7 @@ Json JsonReader::ParseNull() {
Json JsonReader::ParseArray() {
std::vector<Json> data;
char ch { GetConsecutiveChar('[') }; // NOLINT
Char ch { GetConsecutiveChar('[') }; // NOLINT
while (true) {
if (PeekNextChar() == ']') {
GetConsecutiveChar(']');
@@ -503,7 +503,7 @@ Json JsonReader::ParseObject() {
Object::Map data;
SkipSpaces();
char ch = PeekNextChar();
auto ch = PeekNextChar();
if (ch == '}') {
GetConsecutiveChar('}');
@@ -652,7 +652,7 @@ Json JsonReader::ParseNumber() {
Json JsonReader::ParseBoolean() {
bool result = false;
char ch = GetNextNonSpaceChar();
Char ch = GetNextNonSpaceChar();
std::string const t_value = u8"true";
std::string const f_value = u8"false";
@@ -737,7 +737,8 @@ Json UBJReader::ParseArray() {
case 'L':
return ParseTypedArray<I64Array>(n);
default:
LOG(FATAL) << "`" + std::string{type} + "` is not supported for typed array."; // NOLINT
LOG(FATAL) << "`" + std::string{static_cast<char>(type)} + // NOLINT
"` is not supported for typed array.";
}
}
std::vector<Json> results;
@@ -794,7 +795,7 @@ Json UBJReader::Load() {
Json UBJReader::Parse() {
while (true) {
char c = PeekNextChar();
auto c = PeekNextChar();
if (c == -1) {
break;
}
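The motivation for replacing `char` with `JsonReader::Char` throughout this file: `char` has implementation-defined signedness, so the -1 EOF sentinel is not portably distinguishable from a valid byte. A minimal sketch, assuming `Char` is a wider signed integer:

```cpp
#include <cstdint>
#include <string>

using Char = std::int32_t;  // assumption: wide enough for any byte plus -1

Char PeekNextChar(std::string const &raw, std::size_t pos) {
  if (pos >= raw.size()) {
    return -1;  // EOF sentinel, distinct from every valid byte value
  }
  return static_cast<Char>(static_cast<unsigned char>(raw[pos]));
}
```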

View File

@@ -1,13 +1,15 @@
/*!
* Copyright 2022, XGBoost contributors.
/**
* Copyright 2022-2023 by XGBoost contributors.
*/
#ifndef XGBOOST_COMMON_NUMERIC_H_
#define XGBOOST_COMMON_NUMERIC_H_
#include <dmlc/common.h> // OMPException
#include <algorithm> // std::max
#include <iterator> // std::iterator_traits
#include <algorithm> // for std::max
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <iterator> // for iterator_traits
#include <vector>
#include "common.h" // AssertGPUSupport
@@ -15,8 +17,7 @@
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector
namespace xgboost {
namespace common {
namespace xgboost::common {
/**
* \brief Run length encode on CPU, input must be sorted.
@@ -111,11 +112,11 @@ inline double Reduce(Context const*, HostDeviceVector<float> const&) {
namespace cpu_impl {
template <typename It, typename V = typename It::value_type>
V Reduce(Context const* ctx, It first, It second, V const& init) {
size_t n = std::distance(first, second);
common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(ctx->Threads(), init);
common::ParallelFor(n, ctx->Threads(),
[&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + ctx->Threads(), init);
std::size_t n = std::distance(first, second);
auto n_threads = static_cast<std::size_t>(std::min(n, static_cast<std::size_t>(ctx->Threads())));
common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(n_threads, init);
common::ParallelFor(n, n_threads, [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + n_threads, init);
return result;
}
} // namespace cpu_impl
@@ -144,7 +145,6 @@ void Iota(Context const* ctx, It first, It last,
});
}
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // XGBOOST_COMMON_NUMERIC_H_
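The `Reduce` change above clamps the thread count so no accumulator slot stays unused. A standalone sketch of the same pattern, assuming OpenMP:

```cpp
#include <omp.h>

#include <algorithm>  // for std::min
#include <cstdint>    // for int64_t
#include <numeric>    // for std::accumulate
#include <vector>

double Reduce(std::vector<double> const &values, int n_threads) {
  if (values.empty()) {
    return 0.0;
  }
  auto n = static_cast<std::int64_t>(values.size());
  // clamp, mirroring the patch: at most one thread per element
  int nt = static_cast<int>(std::min<std::int64_t>(n, n_threads));
  std::vector<double> tloc(nt, 0.0);  // one accumulator per thread
#pragma omp parallel for num_threads(nt)
  for (std::int64_t i = 0; i < n; ++i) {
    tloc[omp_get_thread_num()] += values[i];
  }
  return std::accumulate(tloc.cbegin(), tloc.cend(), 0.0);  // serial combine
}
```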

View File

@@ -1,391 +1,386 @@
/*!
* Copyright 2021-2022 by Contributors
* \file row_set.h
* \brief Quick Utility to compute subset of rows
* \author Philip Cho, Tianqi Chen
*/
#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_
#define XGBOOST_COMMON_PARTITION_BUILDER_H_
#include <xgboost/data.h>
#include <algorithm>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "../tree/hist/expand_entry.h"
#include "categorical.h"
#include "column_matrix.h"
#include "xgboost/context.h"
#include "xgboost/tree_model.h"
namespace xgboost {
namespace common {
// The builder is required for samples partition to left and rights children for set of nodes
// Responsible for:
// 1) Effective memory allocation for intermediate results for multi-thread work
// 2) Merging partial results produced by threads into original row set (row_set_collection_)
// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
template<size_t BlockSize>
class PartitionBuilder {
using BitVector = RBitField8;
public:
template<typename Func>
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
left_right_nodes_sizes_.resize(n_nodes);
blocks_offsets_.resize(n_nodes+1);
blocks_offsets_[0] = 0;
for (size_t i = 1; i < n_nodes+1; ++i) {
blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTask(i-1);
}
if (n_tasks > max_n_tasks_) {
mem_blocks_.resize(n_tasks);
max_n_tasks_ = n_tasks;
}
}
// split row indexes (rid_span) to 2 parts (left_part, right_part) depending
// on comparison of indexes values (idx_span) and split point (split_cond)
// Handle dense columns
// Analog of std::stable_partition, but in no-inplace manner
template <bool default_left, bool any_missing, typename ColumnType, typename Predicate>
inline std::pair<size_t, size_t> PartitionKernel(ColumnType* p_column,
common::Span<const size_t> row_indices,
common::Span<size_t> left_part,
common::Span<size_t> right_part,
size_t base_rowid, Predicate&& pred) {
auto& column = *p_column;
size_t* p_left_part = left_part.data();
size_t* p_right_part = right_part.data();
size_t nleft_elems = 0;
size_t nright_elems = 0;
auto p_row_indices = row_indices.data();
auto n_samples = row_indices.size();
for (size_t i = 0; i < n_samples; ++i) {
auto rid = p_row_indices[i];
const int32_t bin_id = column[rid - base_rowid];
if (any_missing && bin_id == ColumnType::kMissingId) {
if (default_left) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
} else {
if (pred(rid, bin_id)) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
}
}
return {nleft_elems, nright_elems};
}
template <typename Pred>
inline std::pair<size_t, size_t> PartitionRangeKernel(common::Span<const size_t> ridx,
common::Span<size_t> left_part,
common::Span<size_t> right_part,
Pred pred) {
size_t* p_left_part = left_part.data();
size_t* p_right_part = right_part.data();
size_t nleft_elems = 0;
size_t nright_elems = 0;
for (auto row_id : ridx) {
if (pred(row_id)) {
p_left_part[nleft_elems++] = row_id;
} else {
p_right_part[nright_elems++] = row_id;
}
}
return {nleft_elems, nright_elems};
}
template <typename BinIdxType, bool any_missing, bool any_cat>
void Partition(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
const common::Range1d range,
const bst_bin_t split_cond, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
const RegTree& tree, const size_t* rid) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
std::size_t nid = nodes[node_in_set].nid;
bst_feature_t fid = tree[nid].SplitIndex();
bool default_left = tree[nid].DefaultLeft();
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& cut_values = gmat.cut.Values();
auto pred_hist = [&](auto ridx, auto bin_id) {
if (any_cat && is_cat) {
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
go_left = Decision(node_cats, cut_values[gidx]);
}
return go_left;
} else {
return bin_id <= split_cond;
}
};
auto pred_approx = [&](auto ridx) {
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
if (is_cat) {
go_left = Decision(node_cats, cut_values[gidx]);
} else {
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
}
}
return go_left;
};
std::pair<size_t, size_t> child_nodes_sizes;
if (!column_matrix.IsInitialized()) {
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
} else {
if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
if (default_left) {
child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
} else {
child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
}
} else {
CHECK_EQ(any_missing, true);
auto column =
column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
if (default_left) {
child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
} else {
child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
}
}
}
const size_t n_left = child_nodes_sizes.first;
const size_t n_right = child_nodes_sizes.second;
SetNLeftElems(node_in_set, range.begin(), n_left);
SetNRightElems(node_in_set, range.begin(), n_right);
}
/**
* @brief When data is split by column, we don't have all the features locally on the current
* worker, so we go through all the rows and mark the bit vectors on whether the decision is made
* to go right, or if the feature value used for the split is missing.
*/
void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
const RegTree& tree, const size_t* rid,
BitVector* decision_bits, BitVector* missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
std::size_t nid = nodes[node_in_set].nid;
bst_feature_t fid = tree[nid].SplitIndex();
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& cut_values = gmat.cut.Values();
if (!column_matrix.IsInitialized()) {
for (auto row_id : rid_span) {
auto gidx = gmat.GetGindex(row_id, fid);
if (gidx > -1) {
bool go_left = false;
if (is_cat) {
go_left = Decision(node_cats, cut_values[gidx]);
} else {
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
}
if (go_left) {
decision_bits->Set(row_id - gmat.base_rowid);
}
} else {
missing_bits->Set(row_id - gmat.base_rowid);
}
}
} else {
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
}
}
/**
* @brief Once we've aggregated the decision and missing bits from all the workers, we can then
* use them to partition the rows accordingly.
*/
void PartitionByMask(const size_t node_in_set,
std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const RegTree& tree,
const size_t* rid, BitVector const& decision_bits,
BitVector const& missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
std::size_t nid = nodes[node_in_set].nid;
bool default_left = tree[nid].DefaultLeft();
auto pred_approx = [&](auto ridx) {
bool go_left = default_left;
bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
if (!is_missing) {
go_left = decision_bits.Check(ridx - gmat.base_rowid);
}
return go_left;
};
std::pair<size_t, size_t> child_nodes_sizes;
if (!column_matrix.IsInitialized()) {
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
} else {
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
}
const size_t n_left = child_nodes_sizes.first;
const size_t n_right = child_nodes_sizes.second;
SetNLeftElems(node_in_set, range.begin(), n_left);
SetNRightElems(node_in_set, range.begin(), n_right);
}
// allocate thread local memory, should be called for each specific task
void AllocateForTask(size_t id) {
if (mem_blocks_[id].get() == nullptr) {
BlockInfo* local_block_ptr = new BlockInfo;
CHECK_NE(local_block_ptr, (BlockInfo*)nullptr);
mem_blocks_[id].reset(local_block_ptr);
}
}
common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
const size_t task_idx = GetTaskIdx(nid, begin);
return { mem_blocks_.at(task_idx)->Left(), end - begin };
}
common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
const size_t task_idx = GetTaskIdx(nid, begin);
return { mem_blocks_.at(task_idx)->Right(), end - begin };
}
void SetNLeftElems(int nid, size_t begin, size_t n_left) {
size_t task_idx = GetTaskIdx(nid, begin);
mem_blocks_.at(task_idx)->n_left = n_left;
}
void SetNRightElems(int nid, size_t begin, size_t n_right) {
size_t task_idx = GetTaskIdx(nid, begin);
mem_blocks_.at(task_idx)->n_right = n_right;
}
size_t GetNLeftElems(int nid) const {
return left_right_nodes_sizes_[nid].first;
}
size_t GetNRightElems(int nid) const {
return left_right_nodes_sizes_[nid].second;
}
// Each thread has partial results for some set of tree-nodes
// The function decides order of merging partial results into final row set
void CalculateRowOffsets() {
for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
size_t n_left = 0;
for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
mem_blocks_[j]->n_offset_left = n_left;
n_left += mem_blocks_[j]->n_left;
}
size_t n_right = 0;
for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
mem_blocks_[j]->n_offset_right = n_left + n_right;
n_right += mem_blocks_[j]->n_right;
}
left_right_nodes_sizes_[i] = {n_left, n_right};
}
}
void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
size_t task_idx = GetTaskIdx(nid, begin);
size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left;
size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right;
const size_t* left = mem_blocks_[task_idx]->Left();
const size_t* right = mem_blocks_[task_idx]->Right();
std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result);
std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result);
}
size_t GetTaskIdx(int nid, size_t begin) {
return blocks_offsets_[nid] + begin / BlockSize;
}
// Copy row partitions into global cache for reuse in objective
template <typename Sampledp>
void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set,
std::vector<bst_node_t>* p_position, Sampledp sampledp) const {
auto& h_pos = *p_position;
h_pos.resize(row_set.Data()->size(), std::numeric_limits<bst_node_t>::max());
auto p_begin = row_set.Data()->data();
ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) {
auto const& node = row_set[i];
if (node.node_id < 0) {
return;
}
CHECK(tree[node.node_id].IsLeaf());
if (node.begin) { // guard for empty node.
size_t ptr_offset = node.end - p_begin;
CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
for (auto idx = node.begin; idx != node.end; ++idx) {
h_pos[*idx] = sampledp(*idx) ? ~node.node_id : node.node_id;
}
}
});
}
protected:
struct BlockInfo{
size_t n_left;
size_t n_right;
size_t n_offset_left;
size_t n_offset_right;
size_t* Left() {
return &left_data_[0];
}
size_t* Right() {
return &right_data_[0];
}
private:
size_t left_data_[BlockSize];
size_t right_data_[BlockSize];
};
std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
std::vector<size_t> blocks_offsets_;
std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
size_t max_n_tasks_ = 0;
};
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_
/**
* Copyright 2021-2023 by Contributors
* \file row_set.h
* \brief Quick Utility to compute subset of rows
* \author Philip Cho, Tianqi Chen
*/
#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_
#define XGBOOST_COMMON_PARTITION_BUILDER_H_
#include <xgboost/data.h>
#include <algorithm>
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "../tree/hist/expand_entry.h"
#include "categorical.h"
#include "column_matrix.h"
#include "xgboost/context.h"
#include "xgboost/tree_model.h"
namespace xgboost::common {
// The builder is required for partitioning samples into the left and right children of a set of nodes
// Responsible for:
// 1) Efficient memory allocation for intermediate results of multi-threaded work
// 2) Merging partial results produced by threads into the original row set (row_set_collection_)
// BlockSize is a template parameter to enable easy memory alignment via the C++11 'alignas()' feature
template<size_t BlockSize>
class PartitionBuilder {
using BitVector = RBitField8;
public:
template<typename Func>
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
left_right_nodes_sizes_.resize(n_nodes);
blocks_offsets_.resize(n_nodes+1);
blocks_offsets_[0] = 0;
for (size_t i = 1; i < n_nodes+1; ++i) {
blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTask(i-1);
}
if (n_tasks > max_n_tasks_) {
mem_blocks_.resize(n_tasks);
max_n_tasks_ = n_tasks;
}
}
// Split row indices (rid_span) into two parts (left_part, right_part) depending
// on a comparison of the index values (idx_span) with the split point (split_cond)
// Handles dense columns
// Analogous to std::stable_partition, but not in-place
template <bool default_left, bool any_missing, typename ColumnType, typename Predicate>
inline std::pair<size_t, size_t> PartitionKernel(ColumnType* p_column,
common::Span<const size_t> row_indices,
common::Span<size_t> left_part,
common::Span<size_t> right_part,
size_t base_rowid, Predicate&& pred) {
auto& column = *p_column;
size_t* p_left_part = left_part.data();
size_t* p_right_part = right_part.data();
size_t nleft_elems = 0;
size_t nright_elems = 0;
auto p_row_indices = row_indices.data();
auto n_samples = row_indices.size();
for (size_t i = 0; i < n_samples; ++i) {
auto rid = p_row_indices[i];
const int32_t bin_id = column[rid - base_rowid];
if (any_missing && bin_id == ColumnType::kMissingId) {
if (default_left) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
} else {
if (pred(rid, bin_id)) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
}
}
return {nleft_elems, nright_elems};
}
template <typename Pred>
inline std::pair<size_t, size_t> PartitionRangeKernel(common::Span<const size_t> ridx,
common::Span<size_t> left_part,
common::Span<size_t> right_part,
Pred pred) {
size_t* p_left_part = left_part.data();
size_t* p_right_part = right_part.data();
size_t nleft_elems = 0;
size_t nright_elems = 0;
for (auto row_id : ridx) {
if (pred(row_id)) {
p_left_part[nleft_elems++] = row_id;
} else {
p_right_part[nright_elems++] = row_id;
}
}
return {nleft_elems, nright_elems};
}
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void Partition(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, const bst_bin_t split_cond,
GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix,
const RegTree& tree, const size_t* rid) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
std::size_t nid = nodes[node_in_set].nid;
bst_feature_t fid = tree.SplitIndex(nid);
bool default_left = tree.DefaultLeft(nid);
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& cut_values = gmat.cut.Values();
auto pred_hist = [&](auto ridx, auto bin_id) {
if (any_cat && is_cat) {
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
go_left = Decision(node_cats, cut_values[gidx]);
}
return go_left;
} else {
return bin_id <= split_cond;
}
};
auto pred_approx = [&](auto ridx) {
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
if (is_cat) {
go_left = Decision(node_cats, cut_values[gidx]);
} else {
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
}
}
return go_left;
};
std::pair<size_t, size_t> child_nodes_sizes;
if (!column_matrix.IsInitialized()) {
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
} else {
if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
if (default_left) {
child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
} else {
child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
}
} else {
CHECK_EQ(any_missing, true);
auto column =
column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
if (default_left) {
child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
} else {
child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
gmat.base_rowid, pred_hist);
}
}
}
const size_t n_left = child_nodes_sizes.first;
const size_t n_right = child_nodes_sizes.second;
SetNLeftElems(node_in_set, range.begin(), n_left);
SetNRightElems(node_in_set, range.begin(), n_right);
}
/**
* @brief When data is split by column, we don't have all the features locally on the current
* worker, so we go through all the rows and mark the bit vectors on whether the decision is made
* to go right, or if the feature value used for the split is missing.
*/
template <typename ExpandEntry>
void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
BitVector* decision_bits, BitVector* missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
std::size_t nid = nodes[node_in_set].nid;
bst_feature_t fid = tree[nid].SplitIndex();
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& cut_values = gmat.cut.Values();
if (!column_matrix.IsInitialized()) {
for (auto row_id : rid_span) {
auto gidx = gmat.GetGindex(row_id, fid);
if (gidx > -1) {
bool go_left = false;
if (is_cat) {
go_left = Decision(node_cats, cut_values[gidx]);
} else {
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
}
if (go_left) {
decision_bits->Set(row_id - gmat.base_rowid);
}
} else {
missing_bits->Set(row_id - gmat.base_rowid);
}
}
} else {
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
}
}
/**
* @brief Once we've aggregated the decision and missing bits from all the workers, we can then
* use them to partition the rows accordingly.
*/
template <typename ExpandEntry>
void PartitionByMask(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const RegTree& tree,
const size_t* rid, BitVector const& decision_bits,
BitVector const& missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
std::size_t nid = nodes[node_in_set].nid;
bool default_left = tree[nid].DefaultLeft();
auto pred_approx = [&](auto ridx) {
bool go_left = default_left;
bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
if (!is_missing) {
go_left = decision_bits.Check(ridx - gmat.base_rowid);
}
return go_left;
};
std::pair<size_t, size_t> child_nodes_sizes;
if (!column_matrix.IsInitialized()) {
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
} else {
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
}
const size_t n_left = child_nodes_sizes.first;
const size_t n_right = child_nodes_sizes.second;
SetNLeftElems(node_in_set, range.begin(), n_left);
SetNRightElems(node_in_set, range.begin(), n_right);
}
// allocate thread local memory, should be called for each specific task
void AllocateForTask(size_t id) {
if (mem_blocks_[id].get() == nullptr) {
BlockInfo* local_block_ptr = new BlockInfo;
CHECK_NE(local_block_ptr, (BlockInfo*)nullptr);
mem_blocks_[id].reset(local_block_ptr);
}
}
common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
const size_t task_idx = GetTaskIdx(nid, begin);
return { mem_blocks_.at(task_idx)->Left(), end - begin };
}
common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
const size_t task_idx = GetTaskIdx(nid, begin);
return { mem_blocks_.at(task_idx)->Right(), end - begin };
}
void SetNLeftElems(int nid, size_t begin, size_t n_left) {
size_t task_idx = GetTaskIdx(nid, begin);
mem_blocks_.at(task_idx)->n_left = n_left;
}
void SetNRightElems(int nid, size_t begin, size_t n_right) {
size_t task_idx = GetTaskIdx(nid, begin);
mem_blocks_.at(task_idx)->n_right = n_right;
}
[[nodiscard]] std::size_t GetNLeftElems(int nid) const {
return left_right_nodes_sizes_[nid].first;
}
[[nodiscard]] std::size_t GetNRightElems(int nid) const {
return left_right_nodes_sizes_[nid].second;
}
// Each thread has partial results for some set of tree-nodes
// The function decides order of merging partial results into final row set
void CalculateRowOffsets() {
for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
size_t n_left = 0;
for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
mem_blocks_[j]->n_offset_left = n_left;
n_left += mem_blocks_[j]->n_left;
}
size_t n_right = 0;
for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
mem_blocks_[j]->n_offset_right = n_left + n_right;
n_right += mem_blocks_[j]->n_right;
}
left_right_nodes_sizes_[i] = {n_left, n_right};
}
}
void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
size_t task_idx = GetTaskIdx(nid, begin);
size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left;
size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right;
const size_t* left = mem_blocks_[task_idx]->Left();
const size_t* right = mem_blocks_[task_idx]->Right();
std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result);
std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result);
}
size_t GetTaskIdx(int nid, size_t begin) {
return blocks_offsets_[nid] + begin / BlockSize;
}
// Copy row partitions into global cache for reuse in objective
template <typename Sampledp>
void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set,
std::vector<bst_node_t>* p_position, Sampledp sampledp) const {
auto& h_pos = *p_position;
h_pos.resize(row_set.Data()->size(), std::numeric_limits<bst_node_t>::max());
auto p_begin = row_set.Data()->data();
ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) {
auto const& node = row_set[i];
if (node.node_id < 0) {
return;
}
CHECK(tree.IsLeaf(node.node_id));
if (node.begin) { // guard for empty node.
size_t ptr_offset = node.end - p_begin;
CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
for (auto idx = node.begin; idx != node.end; ++idx) {
h_pos[*idx] = sampledp(*idx) ? ~node.node_id : node.node_id;
}
}
});
}
protected:
struct BlockInfo{
size_t n_left;
size_t n_right;
size_t n_offset_left;
size_t n_offset_right;
size_t* Left() {
return &left_data_[0];
}
size_t* Right() {
return &right_data_[0];
}
private:
size_t left_data_[BlockSize];
size_t right_data_[BlockSize];
};
std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
std::vector<size_t> blocks_offsets_;
std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
size_t max_n_tasks_ = 0;
};
} // namespace xgboost::common
#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_
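A hedged sketch of the builder's intended call sequence; the task counts and buffers are hypothetical, and the real driver lives in the hist updaters:

```cpp
common::PartitionBuilder<2048> builder;
// one entry per node: how many block-sized tasks that node needs
builder.Init(n_tasks, n_nodes, [&](size_t nid) { return tasks_per_node[nid]; });
builder.AllocateForTask(task_id);  // per task, before partitioning
// builder.Partition<BinIdxT, any_missing, any_cat>(...) fills per-task buffers
builder.CalculateRowOffsets();     // decide the merge order across threads
builder.MergeToArray(nid, begin, row_indices.data());  // write row ids back
```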

View File

@@ -359,6 +359,7 @@ void AddCutPoint(typename SketchType::SummaryContainer const &summary, int max_b
HistogramCuts *cuts) {
size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
auto &cut_values = cuts->cut_values_.HostVector();
// we use the min_value as the first (0th) element, hence starting from 1.
for (size_t i = 1; i < required_cuts; ++i) {
bst_float cpt = summary.data[i].value;
if (i == 1 || cpt > cut_values.back()) {
@@ -419,8 +420,8 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
} else {
AddCutPoint<WQSketch>(a, max_num_bins, cuts);
// push a value that is greater than anything
const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value
: cuts->min_vals_.HostVector()[fid];
const bst_float cpt =
(a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
// this must be bigger than last value in a scale
const bst_float last = cpt + (fabs(cpt) + 1e-5f);
cuts->cut_values_.HostVector().push_back(last);
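A worked instance of the sentinel: since `last = cpt + (|cpt| + 1e-5)`, it strictly exceeds `cpt` for either sign.

```cpp
// cpt =  4.2f -> last =  4.2f + (4.2f + 1e-5f) ≈ 8.4f   (>  4.2f)
// cpt = -4.2f -> last = -4.2f + (4.2f + 1e-5f) ≈ 1e-5f  (> -4.2f)
```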

View File

@@ -352,19 +352,6 @@ struct WQSummary {
prev_rmax = data[i].rmax;
}
}
// check consistency of the summary
inline bool Check(const char *msg) const {
const float tol = 10.0f;
for (size_t i = 0; i < this->size; ++i) {
if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
LOG(INFO) << "---------- WQSummary::Check did not pass ----------";
this->Print();
return false;
}
}
return true;
}
};
/*! \brief try to do efficient pruning */

View File

@@ -6,9 +6,7 @@
#include <algorithm> // for copy_n, max, min, none_of, all_of
#include <cstddef> // for size_t
#include <cstdio> // for sscanf
#include <exception> // for exception
#include <functional> // for greater
#include <iterator> // for reverse_iterator
#include <string> // for char_traits, string
#include "algorithm.h" // for ArgSort
@@ -18,12 +16,113 @@
#include "xgboost/base.h" // for bst_group_t
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/linalg.h" // for All, TensorView, Range, Tensor, Vector
#include "xgboost/logging.h" // for Error, LogCheck_EQ, CHECK_EQ
#include "xgboost/linalg.h" // for All, TensorView, Range
#include "xgboost/logging.h" // for CHECK_EQ
namespace xgboost::ltr {
void RankingCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
if (info.group_ptr_.empty()) {
group_ptr_.Resize(2, 0);
group_ptr_.HostVector()[1] = info.num_row_;
} else {
group_ptr_.HostVector() = info.group_ptr_;
}
auto const& gptr = group_ptr_.ConstHostVector();
for (std::size_t i = 1; i < gptr.size(); ++i) {
std::size_t n = gptr[i] - gptr[i - 1];
max_group_size_ = std::max(max_group_size_, n);
}
double sum_weights = 0;
auto n_groups = Groups();
auto weight = common::MakeOptionalWeights(ctx, info.weights_);
for (bst_omp_uint k = 0; k < n_groups; ++k) {
sum_weights += weight[k];
}
weight_norm_ = static_cast<double>(n_groups) / sum_weights;
}
common::Span<std::size_t const> RankingCache::MakeRankOnCPU(Context const* ctx,
common::Span<float const> predt) {
auto gptr = this->DataGroupPtr(ctx);
auto rank = this->sorted_idx_cache_.HostSpan();
CHECK_EQ(rank.size(), predt.size());
common::ParallelFor(this->Groups(), ctx->Threads(), [&](auto g) {
auto cnt = gptr[g + 1] - gptr[g];
auto g_predt = predt.subspan(gptr[g], cnt);
auto g_rank = rank.subspan(gptr[g], cnt);
auto sorted_idx = common::ArgSort<std::size_t>(
ctx, g_predt.data(), g_predt.data() + g_predt.size(), std::greater<>{});
CHECK_EQ(g_rank.size(), sorted_idx.size());
std::copy_n(sorted_idx.data(), sorted_idx.size(), g_rank.data());
});
return rank;
}
#if !defined(XGBOOST_USE_CUDA)
void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*,
common::Span<float const>) {
common::AssertGPUSupport();
return {};
}
#endif // !defined(XGBOOST_USE_CUDA)
void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
auto const h_group_ptr = this->DataGroupPtr(ctx);
discounts_.Resize(MaxGroupSize(), 0);
auto& h_discounts = discounts_.HostVector();
for (std::size_t i = 0; i < MaxGroupSize(); ++i) {
h_discounts[i] = CalcDCGDiscount(i);
}
auto n_groups = h_group_ptr.size() - 1;
auto h_labels = info.labels.HostView().Slice(linalg::All(), 0);
CheckNDCGLabels(this->Param(), h_labels,
[](auto beg, auto end, auto op) { return std::none_of(beg, end, op); });
inv_idcg_.Reshape(n_groups);
auto h_inv_idcg = inv_idcg_.HostView();
std::size_t topk = this->Param().TopK();
auto const exp_gain = this->Param().ndcg_exp_gain;
common::ParallelFor(n_groups, ctx->Threads(), [&](auto g) {
auto g_labels = h_labels.Slice(linalg::Range(h_group_ptr[g], h_group_ptr[g + 1]));
auto sorted_idx = common::ArgSort<std::size_t>(ctx, linalg::cbegin(g_labels),
linalg::cend(g_labels), std::greater<>{});
double idcg{0.0};
for (std::size_t i = 0; i < std::min(g_labels.Size(), topk); ++i) {
if (exp_gain) {
idcg += h_discounts[i] * CalcDCGGain(g_labels(sorted_idx[i]));
} else {
idcg += h_discounts[i] * g_labels(sorted_idx[i]);
}
}
h_inv_idcg(g) = CalcInvIDCG(idcg);
});
}
#if !defined(XGBOOST_USE_CUDA)
void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
#endif // !defined(XGBOOST_USE_CUDA)
DMLC_REGISTER_PARAMETER(LambdaRankParam);
void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {
auto const& h_label = info.labels.HostView().Slice(linalg::All(), 0);
CheckMapLabels(h_label, [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
}
#if !defined(XGBOOST_USE_CUDA)
void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
#endif // !defined(XGBOOST_USE_CUDA)
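The CPU IDCG loop above, condensed into a standalone sketch with the exponential gain used when `ndcg_exp_gain` is set:

```cpp
#include <algorithm>   // for std::sort, std::min
#include <cmath>       // for std::log2
#include <cstdint>
#include <functional>  // for std::greater
#include <vector>

double InvIDCG(std::vector<std::uint32_t> labels, std::size_t topk) {
  std::sort(labels.begin(), labels.end(), std::greater<>{});  // ideal ordering
  double idcg = 0.0;
  for (std::size_t i = 0; i < std::min(labels.size(), topk); ++i) {
    double gain = static_cast<double>((1u << labels[i]) - 1);  // 2^l - 1
    idcg += gain / std::log2(static_cast<double>(i) + 2.0);    // discount
  }
  return idcg == 0.0 ? 0.0 : 1.0 / idcg;  // 0 for all-irrelevant groups
}
// labels {3, 2, 0}, topk = 3: idcg = 7/1 + 3/log2(3) + 0 ≈ 8.89, inverse ≈ 0.112
```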
std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) {
std::string out_name;
if (!param.empty()) {

src/common/ranking_utils.cu (new file, 212 lines)
View File

@@ -0,0 +1,212 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#include <thrust/functional.h> // for maximum
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/logical.h> // for none_of, all_of
#include <thrust/pair.h> // for pair, make_pair
#include <thrust/reduce.h> // for reduce
#include <thrust/scan.h> // for inclusive_scan
#include <cstddef> // for size_t
#include "algorithm.cuh" // for SegmentedArgSort
#include "cuda_context.cuh" // for CUDAContext
#include "device_helpers.cuh" // for MakeTransformIterator, LaunchN
#include "optional_weight.h" // for MakeOptionalWeights, OptionalWeights
#include "ranking_utils.cuh" // for ThreadsForMean
#include "ranking_utils.h"
#include "threading_utils.cuh" // for SegmentedTrapezoidThreads
#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t
#include "xgboost/context.h" // for Context
#include "xgboost/linalg.h" // for VectorView, All, Range
#include "xgboost/logging.h" // for CHECK
#include "xgboost/span.h" // for Span
namespace xgboost::ltr {
namespace cuda_impl {
void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
common::Span<bst_group_t const> d_group_ptr, std::size_t k,
linalg::VectorView<double> out_dcg) {
CHECK_EQ(d_group_ptr.size() - 1, out_dcg.Size());
using IdxGroup = thrust::pair<std::size_t, std::size_t>;
auto group_it = dh::MakeTransformIterator<IdxGroup>(
thrust::make_counting_iterator(0ull), [=] XGBOOST_DEVICE(std::size_t idx) {
return thrust::make_pair(idx, dh::SegmentId(d_group_ptr, idx)); // NOLINT
});
auto value_it = dh::MakeTransformIterator<double>(
group_it,
[exp_gain, d_labels, d_group_ptr, k,
d_sorted_idx] XGBOOST_DEVICE(IdxGroup const& l) -> double {
auto g_begin = d_group_ptr[l.second];
auto g_size = d_group_ptr[l.second + 1] - g_begin;
auto idx_in_group = l.first - g_begin;
if (idx_in_group >= k) {
return 0.0;
}
double gain{0.0};
auto g_sorted_idx = d_sorted_idx.subspan(g_begin, g_size);
auto g_labels = d_labels.Slice(linalg::Range(g_begin, g_begin + g_size));
if (exp_gain) {
gain = ltr::CalcDCGGain(g_labels(g_sorted_idx[idx_in_group]));
} else {
gain = g_labels(g_sorted_idx[idx_in_group]);
}
double discount = CalcDCGDiscount(idx_in_group);
return gain * discount;
});
CHECK(out_dcg.Contiguous());
std::size_t bytes;
cub::DeviceSegmentedReduce::Sum(nullptr, bytes, value_it, out_dcg.Values().data(),
d_group_ptr.size() - 1, d_group_ptr.data(),
d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
dh::TemporaryArray<char> temp(bytes);
cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, value_it, out_dcg.Values().data(),
d_group_ptr.size() - 1, d_group_ptr.data(),
d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
}
void CalcQueriesInvIDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
common::Span<bst_group_t const> d_group_ptr,
linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const& p) {
CHECK_GE(d_group_ptr.size(), 2ul);
size_t n_groups = d_group_ptr.size() - 1;
CHECK_EQ(out_inv_IDCG.Size(), n_groups);
dh::device_vector<std::size_t> sorted_idx(d_labels.Size());
auto d_sorted_idx = dh::ToSpan(sorted_idx);
common::SegmentedArgSort<false, true>(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx);
CalcQueriesDCG(ctx, d_labels, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(), out_inv_IDCG);
dh::LaunchN(out_inv_IDCG.Size(), ctx->CUDACtx()->Stream(),
[out_inv_IDCG] XGBOOST_DEVICE(size_t idx) mutable {
double idcg = out_inv_IDCG(idx);
out_inv_IDCG(idx) = CalcInvIDCG(idcg);
});
}
} // namespace cuda_impl
namespace {
struct CheckNDCGOp {
CUDAContext const* cuctx;
template <typename It, typename Op>
bool operator()(It beg, It end, Op op) {
return thrust::none_of(cuctx->CTP(), beg, end, op);
}
};
struct CheckMAPOp {
CUDAContext const* cuctx;
template <typename It, typename Op>
bool operator()(It beg, It end, Op op) {
return thrust::all_of(cuctx->CTP(), beg, end, op);
}
};
struct ThreadGroupOp {
common::Span<bst_group_t const> d_group_ptr;
std::size_t n_pairs;
common::Span<std::size_t> out_thread_group_ptr;
XGBOOST_DEVICE void operator()(std::size_t i) {
out_thread_group_ptr[i + 1] =
cuda_impl::ThreadsForMean(d_group_ptr[i + 1] - d_group_ptr[i], n_pairs);
}
};
struct GroupSizeOp {
common::Span<bst_group_t const> d_group_ptr;
XGBOOST_DEVICE auto operator()(std::size_t i) -> std::size_t {
return d_group_ptr[i + 1] - d_group_ptr[i];
}
};
struct WeightOp {
common::OptionalWeights d_weight;
XGBOOST_DEVICE auto operator()(std::size_t i) -> double { return d_weight[i]; }
};
} // anonymous namespace
void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
CUDAContext const* cuctx = ctx->CUDACtx();
group_ptr_.SetDevice(ctx->gpu_id);
if (info.group_ptr_.empty()) {
group_ptr_.Resize(2, 0);
group_ptr_.HostVector()[1] = info.num_row_;
} else {
auto const& h_group_ptr = info.group_ptr_;
group_ptr_.Resize(h_group_ptr.size());
auto d_group_ptr = group_ptr_.DeviceSpan();
dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
cudaMemcpyHostToDevice, cuctx->Stream()));
}
auto d_group_ptr = DataGroupPtr(ctx);
std::size_t n_groups = Groups();
auto it = dh::MakeTransformIterator<std::size_t>(thrust::make_counting_iterator(0ul),
GroupSizeOp{d_group_ptr});
max_group_size_ =
thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum<std::size_t>{});
threads_group_ptr_.SetDevice(ctx->gpu_id);
threads_group_ptr_.Resize(n_groups + 1, 0);
auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan();
if (param_.HasTruncation()) {
n_cuda_threads_ =
common::SegmentedTrapezoidThreads(d_group_ptr, d_threads_group_ptr, Param().NumPair());
} else {
auto n_pairs = Param().NumPair();
dh::LaunchN(n_groups, cuctx->Stream(),
ThreadGroupOp{d_group_ptr, n_pairs, d_threads_group_ptr});
thrust::inclusive_scan(cuctx->CTP(), dh::tcbegin(d_threads_group_ptr),
dh::tcend(d_threads_group_ptr), dh::tbegin(d_threads_group_ptr));
n_cuda_threads_ = info.num_row_ * param_.NumPair();
}
sorted_idx_cache_.SetDevice(ctx->gpu_id);
sorted_idx_cache_.Resize(info.labels.Size(), 0);
auto weight = common::MakeOptionalWeights(ctx, info.weights_);
auto w_it =
dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), WeightOp{weight});
weight_norm_ = static_cast<double>(n_groups) / thrust::reduce(w_it, w_it + n_groups);
}
common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const* ctx,
common::Span<float const> predt) {
auto d_sorted_idx = sorted_idx_cache_.DeviceSpan();
auto d_group_ptr = DataGroupPtr(ctx);
common::SegmentedArgSort<false, true>(ctx, predt, d_group_ptr, d_sorted_idx);
return d_sorted_idx;
}
void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
CUDAContext const* cuctx = ctx->CUDACtx();
auto labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
CheckNDCGLabels(this->Param(), labels, CheckNDCGOp{cuctx});
auto d_group_ptr = this->DataGroupPtr(ctx);
std::size_t n_groups = d_group_ptr.size() - 1;
inv_idcg_ = linalg::Zeros<double>(ctx, n_groups);
auto d_inv_idcg = inv_idcg_.View(ctx->gpu_id);
cuda_impl::CalcQueriesInvIDCG(ctx, labels, d_group_ptr, d_inv_idcg, this->Param());
CHECK_GE(this->Param().NumPair(), 1ul);
discounts_.SetDevice(ctx->gpu_id);
discounts_.Resize(MaxGroupSize());
auto d_discount = discounts_.DeviceSpan();
dh::LaunchN(MaxGroupSize(), cuctx->Stream(),
[=] XGBOOST_DEVICE(std::size_t i) { d_discount[i] = CalcDCGDiscount(i); });
}
void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
CheckMapLabels(d_label, CheckMAPOp{ctx->CUDACtx()});
}
} // namespace xgboost::ltr
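Note the two-phase CUB idiom in `CalcQueriesDCG` above: the first call with a null workspace only queries the temporary-storage size, the second performs the reduction. A minimal sketch with hypothetical device pointers:

```cpp
// d_in: per-element values; d_offsets: n_segments + 1 boundaries; d_out: one sum per segment.
std::size_t bytes = 0;
cub::DeviceSegmentedReduce::Sum(nullptr, bytes, d_in, d_out, n_segments,
                                d_offsets, d_offsets + 1, stream);  // size query only
dh::TemporaryArray<char> temp(bytes);  // allocate the queried workspace
cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, d_in, d_out, n_segments,
                                d_offsets, d_offsets + 1, stream);  // actual reduction
```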

View File

@@ -0,0 +1,40 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_RANKING_UTILS_CUH_
#define XGBOOST_COMMON_RANKING_UTILS_CUH_
#include <cstddef> // for size_t
#include "ranking_utils.h" // for LambdaRankParam
#include "xgboost/base.h" // for bst_group_t, XGBOOST_DEVICE
#include "xgboost/context.h" // for Context
#include "xgboost/linalg.h" // for VectorView
#include "xgboost/span.h" // for Span
namespace xgboost {
namespace ltr {
namespace cuda_impl {
void CalcQueriesDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
common::Span<bst_group_t const> d_group_ptr, std::size_t k,
linalg::VectorView<double> out_dcg);
void CalcQueriesInvIDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
common::Span<bst_group_t const> d_group_ptr,
linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const &p);
// Helpers for computing the number of CUDA threads for a group, and for recovering
// the number of pairs from a given thread count.
XGBOOST_DEVICE __forceinline__ std::size_t ThreadsForMean(std::size_t group_size,
std::size_t n_pairs) {
return group_size * n_pairs;
}
XGBOOST_DEVICE __forceinline__ std::size_t PairsForGroup(std::size_t n_threads,
std::size_t group_size) {
return n_threads / group_size;
}
} // namespace cuda_impl
} // namespace ltr
} // namespace xgboost
#endif // XGBOOST_COMMON_RANKING_UTILS_CUH_
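The two helpers are exact inverses whenever the thread count is a whole multiple of the group size, which is how they are used together. A sanity sketch:

```cpp
#include <cassert>
#include <cstddef>

void CheckInverse() {
  std::size_t group_size = 128, n_pairs = 4;
  std::size_t n_threads = cuda_impl::ThreadsForMean(group_size, n_pairs);  // 512
  assert(cuda_impl::PairsForGroup(n_threads, group_size) == n_pairs);      // (g*n)/g == n
}
```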

View File

@@ -11,7 +11,6 @@
#include <string> // for char_traits, string
#include <vector> // for vector
#include "./math.h" // for CloseTo
#include "dmlc/parameter.h" // for FieldEntry, DMLC_DECLARE_FIELD
#include "error_msg.h" // for GroupWeight, GroupSize
#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t
@@ -19,7 +18,7 @@
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for Vector, VectorView, Tensor
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK
#include "xgboost/logging.h" // for CHECK_EQ, CHECK
#include "xgboost/parameter.h" // for XGBoostParameter
#include "xgboost/span.h" // for Span
#include "xgboost/string_view.h" // for StringView
@@ -34,6 +33,25 @@ using rel_degree_t = std::uint32_t; // NOLINT
*/
using position_t = std::uint32_t; // NOLINT
/**
* \brief Maximum relevance degree for NDCG
*/
constexpr std::size_t MaxRel() { return sizeof(rel_degree_t) * 8 - 1; }
static_assert(MaxRel() == 31);
XGBOOST_DEVICE inline double CalcDCGGain(rel_degree_t label) {
return static_cast<double>((1u << label) - 1);
}
XGBOOST_DEVICE inline double CalcDCGDiscount(std::size_t idx) {
return 1.0 / std::log2(static_cast<double>(idx) + 2.0);
}
XGBOOST_DEVICE inline double CalcInvIDCG(double idcg) {
auto inv_idcg = (idcg == 0.0 ? 0.0 : (1.0 / idcg)); // handle irrelevant document
return inv_idcg;
}
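In formula form, the three helpers above implement the standard exponential-gain DCG and its normalization:

```latex
\mathrm{DCG}@k = \sum_{i=0}^{k-1} \frac{2^{l_i} - 1}{\log_2(i + 2)}, \qquad
\mathrm{NDCG}@k = \frac{\mathrm{DCG}@k}{\mathrm{IDCG}@k}
```

where `l_i` is the relevance degree of the document at rank `i`; `CalcInvIDCG` caches 1/IDCG, defined as 0 for groups with no relevant documents.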
enum class PairMethod : std::int32_t {
kTopK = 0,
kMean = 1,
@@ -115,7 +133,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
.describe("Number of pairs for each sample in the list.");
DMLC_DECLARE_FIELD(lambdarank_unbiased)
.set_default(false)
.describe("Unbiased lambda mart. Use IPW to debias click position");
.describe("Unbiased lambda mart. Use extended IPW to debias click position");
DMLC_DECLARE_FIELD(lambdarank_bias_norm)
.set_default(2.0)
.set_lower_bound(0.0)
@@ -126,6 +144,285 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
}
};
/**
* \brief Common cached items for ranking tasks.
*/
class RankingCache {
private:
void InitOnCPU(Context const* ctx, MetaInfo const& info);
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
// Cached parameter
LambdaRankParam param_;
// offset to data groups.
HostDeviceVector<bst_group_t> group_ptr_;
// store the sorted index of prediction.
HostDeviceVector<std::size_t> sorted_idx_cache_;
// Maximum size of group
std::size_t max_group_size_{0};
// Normalization for weight
double weight_norm_{1.0};
/**
* CUDA cache
*/
// offset to threads assigned to each group for gradient calculation
HostDeviceVector<std::size_t> threads_group_ptr_;
// Sorted index of label for finding buckets.
HostDeviceVector<std::size_t> y_sorted_idx_cache_;
// Cached labels sorted by the model
HostDeviceVector<float> y_ranked_by_model_;
// store rounding factor for objective for each group
linalg::Vector<GradientPair> roundings_;
// rounding factor for cost
HostDeviceVector<double> cost_rounding_;
// temporary storage for creating rounding factors. Stored as byte to avoid having cuda
// data structure in here.
HostDeviceVector<std::uint8_t> max_lambdas_;
// total number of cuda threads used for gradient calculation
std::size_t n_cuda_threads_{0};
// Create model rank list on GPU
common::Span<std::size_t const> MakeRankOnCUDA(Context const* ctx,
common::Span<float const> predt);
// Create model rank list on CPU
common::Span<std::size_t const> MakeRankOnCPU(Context const* ctx,
common::Span<float const> predt);
protected:
[[nodiscard]] std::size_t MaxGroupSize() const { return max_group_size_; }
public:
RankingCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : param_{p} {
CHECK(param_.GetInitialised());
if (!info.group_ptr_.empty()) {
CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
<< error::GroupSize() << "the size of label.";
}
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
this->InitOnCUDA(ctx, info);
}
if (!info.weights_.Empty()) {
CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
}
}
[[nodiscard]] std::size_t MaxPositionSize() const {
// Use truncation level as bound.
if (param_.HasTruncation()) {
return param_.NumPair();
}
// Hardcoded maximum size of positions to track. We don't need too many of them as the
// bias decreases exponentially.
return std::min(max_group_size_, static_cast<std::size_t>(32));
}
// Constructed as [1, n_samples] if group ptr is not supplied by the user
common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
group_ptr_.SetDevice(ctx->gpu_id);
return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
}
[[nodiscard]] auto const& Param() const { return param_; }
[[nodiscard]] std::size_t Groups() const { return group_ptr_.Size() - 1; }
[[nodiscard]] double WeightNorm() const { return weight_norm_; }
// Create a rank list by model prediction
common::Span<std::size_t const> SortedIdx(Context const* ctx, common::Span<float const> predt) {
if (sorted_idx_cache_.Empty()) {
sorted_idx_cache_.SetDevice(ctx->gpu_id);
sorted_idx_cache_.Resize(predt.size());
}
if (ctx->IsCPU()) {
return this->MakeRankOnCPU(ctx, predt);
} else {
return this->MakeRankOnCUDA(ctx, predt);
}
}
// The function simply returns an uninitialized buffer, as it is only used by the
// objective for creating pairs.
common::Span<std::size_t> SortedIdxY(Context const* ctx, std::size_t n_samples) {
CHECK(ctx->IsCUDA());
if (y_sorted_idx_cache_.Empty()) {
y_sorted_idx_cache_.SetDevice(ctx->gpu_id);
y_sorted_idx_cache_.Resize(n_samples);
}
return y_sorted_idx_cache_.DeviceSpan();
}
common::Span<float> RankedY(Context const* ctx, std::size_t n_samples) {
CHECK(ctx->IsCUDA());
if (y_ranked_by_model_.Empty()) {
y_ranked_by_model_.SetDevice(ctx->gpu_id);
y_ranked_by_model_.Resize(n_samples);
}
return y_ranked_by_model_.DeviceSpan();
}
// CUDA cache getters, the cache is shared between metric and objective, some of these
// fields are lazy initialized to avoid unnecessary allocation.
[[nodiscard]] common::Span<std::size_t const> CUDAThreadsGroupPtr() const {
CHECK(!threads_group_ptr_.Empty());
return threads_group_ptr_.ConstDeviceSpan();
}
[[nodiscard]] std::size_t CUDAThreads() const { return n_cuda_threads_; }
linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
if (roundings_.Size() == 0) {
roundings_.SetDevice(ctx->gpu_id);
roundings_.Reshape(Groups());
}
return roundings_.View(ctx->gpu_id);
}
common::Span<double> CUDACostRounding(Context const* ctx) {
if (cost_rounding_.Size() == 0) {
cost_rounding_.SetDevice(ctx->gpu_id);
cost_rounding_.Resize(1);
}
return cost_rounding_.DeviceSpan();
}
template <typename Type>
common::Span<Type> MaxLambdas(Context const* ctx, std::size_t n) {
max_lambdas_.SetDevice(ctx->gpu_id);
std::size_t bytes = n * sizeof(Type);
if (bytes != max_lambdas_.Size()) {
max_lambdas_.Resize(bytes);
}
return common::Span<Type>{reinterpret_cast<Type*>(max_lambdas_.DevicePointer()), n};
}
};
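A hedged sketch of how the cache is consumed by an objective or metric; `ctx`, `info`, and `predt` are assumed to exist, and the parameter initialization uses dmlc's generic setter:

```cpp
ltr::LambdaRankParam param;
param.UpdateAllowUnknown(Args{});  // defaults; satisfies the GetInitialised() check
auto cache = std::make_shared<ltr::RankingCache>(ctx, info, param);
// per-group argsort of the predictions, lazily allocated on first use:
common::Span<std::size_t const> rank = cache->SortedIdx(ctx, predt);
```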
class NDCGCache : public RankingCache {
// NDCG discount
HostDeviceVector<double> discounts_;
// 1.0 / IDCG
linalg::Vector<double> inv_idcg_;
/**
* CUDA cache
*/
// store the intermediate DCG calculation result for metric
linalg::Vector<double> dcg_;
public:
void InitOnCPU(Context const* ctx, MetaInfo const& info);
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
public:
NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
: RankingCache{ctx, info, p} {
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
this->InitOnCUDA(ctx, info);
}
}
linalg::VectorView<double const> InvIDCG(Context const* ctx) const {
return inv_idcg_.View(ctx->gpu_id);
}
common::Span<double const> Discount(Context const* ctx) const {
return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
}
linalg::VectorView<double> Dcg(Context const* ctx) {
if (dcg_.Size() == 0) {
dcg_.SetDevice(ctx->gpu_id);
dcg_.Reshape(this->Groups());
}
return dcg_.View(ctx->gpu_id);
}
};
/**
* \brief Validate label for NDCG
*
* \tparam NoneOf Implementation of std::none_of. Specified as a parameter to reuse the
* check for both CPU and GPU.
*/
template <typename NoneOf>
void CheckNDCGLabels(ltr::LambdaRankParam const& p, linalg::VectorView<float const> labels,
NoneOf none_of) {
auto d_labels = labels.Values();
if (p.ndcg_exp_gain) {
auto label_is_integer =
none_of(d_labels.data(), d_labels.data() + d_labels.size(), [] XGBOOST_DEVICE(float v) {
auto l = std::floor(v);
return std::fabs(l - v) > kRtEps || v < 0.0f;
});
CHECK(label_is_integer)
<< "When using relevance degree as target, label must be either 0 or positive integer.";
auto label_is_valid = none_of(d_labels.data(), d_labels.data() + d_labels.size(),
[] XGBOOST_DEVICE(ltr::rel_degree_t v) { return v > MaxRel(); });
CHECK(label_is_valid) << "Relevance degress must be lesser than or equal to " << MaxRel()
<< " when the exponential NDCG gain function is used. "
<< "Set `ndcg_exp_gain` to false to use custom DCG gain.";
}
}
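// Usage sketch (hypothetical helper, assuming <algorithm> is included): the NoneOf
// parameter lets the same validation run on both backends; a CPU caller can simply bind
// std::none_of, while a CUDA caller would wrap thrust::none_of instead.
inline void CheckNDCGLabelsOnCPU(ltr::LambdaRankParam const& p,
                                 linalg::VectorView<float const> labels) {
  CheckNDCGLabels(p, labels, [](auto first, auto last, auto op) {
    return std::none_of(first, last, op);
  });
}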
template <typename AllOf>
bool IsBinaryRel(linalg::VectorView<float const> label, AllOf all_of) {
auto s_label = label.Values();
return all_of(s_label.data(), s_label.data() + s_label.size(), [] XGBOOST_DEVICE(float y) {
return std::abs(y - 1.0f) < kRtEps || std::abs(y - 0.0f) < kRtEps;
});
}
/**
* \brief Validate label for MAP
*
 * \tparam AllOf Implementation of std::all_of. Specified as a parameter to reuse the check for
* both CPU and GPU.
*/
template <typename AllOf>
void CheckMapLabels(linalg::VectorView<float const> label, AllOf all_of) {
auto s_label = label.Values();
auto is_binary = IsBinaryRel(label, all_of);
CHECK(is_binary) << "MAP can only be used with binary labels.";
}
class MAPCache : public RankingCache {
// Total number of relevant documents for each group
HostDeviceVector<double> n_rel_;
// \sum l_k/k
HostDeviceVector<double> acc_;
HostDeviceVector<double> map_;
// Number of samples in this dataset.
std::size_t n_samples_{0};
void InitOnCPU(Context const* ctx, MetaInfo const& info);
void InitOnCUDA(Context const* ctx, MetaInfo const& info);
public:
MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
: RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
this->InitOnCUDA(ctx, info);
}
}
common::Span<double> NumRelevant(Context const* ctx) {
if (n_rel_.Empty()) {
n_rel_.SetDevice(ctx->gpu_id);
n_rel_.Resize(n_samples_);
}
return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();
}
common::Span<double> Acc(Context const* ctx) {
if (acc_.Empty()) {
acc_.SetDevice(ctx->gpu_id);
acc_.Resize(n_samples_);
}
return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();
}
common::Span<double> Map(Context const* ctx) {
if (map_.Empty()) {
map_.SetDevice(ctx->gpu_id);
map_.Resize(this->Groups());
}
return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();
}
};
/**
* \brief Parse name for ranking metric given parameters.
*

View File

@@ -8,9 +8,11 @@
#include <dmlc/omp.h>
#include <algorithm>
#include <cstdint> // std::int32_t
#include <cstdint> // for int32_t
#include <cstdlib> // for malloc, free
#include <limits>
#include <type_traits> // std::is_signed
#include <new> // for bad_alloc
#include <type_traits> // for is_signed
#include <vector>
#include "xgboost/logging.h"
@@ -266,7 +268,7 @@ class MemStackAllocator {
if (MaxStackSize >= required_size_) {
ptr_ = stack_mem_;
} else {
ptr_ = reinterpret_cast<T*>(malloc(required_size_ * sizeof(T)));
ptr_ = reinterpret_cast<T*>(std::malloc(required_size_ * sizeof(T)));
}
if (!ptr_) {
throw std::bad_alloc{};
@@ -278,7 +280,7 @@ class MemStackAllocator {
~MemStackAllocator() {
if (required_size_ > MaxStackSize) {
free(ptr_);
std::free(ptr_);
}
}
T& operator[](size_t i) { return ptr_[i]; }

View File

@@ -10,13 +10,16 @@
#include <cstring>
#include "../collective/communicator-inl.h"
#include "../common/algorithm.h" // StableSort
#include "../common/api_entry.h" // XGBAPIThreadLocalEntry
#include "../collective/communicator.h"
#include "../common/common.h"
#include "../common/algorithm.h" // for StableSort
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/error_msg.h" // for InfInData
#include "../common/group_data.h"
#include "../common/io.h"
#include "../common/linalg_op.h"
#include "../common/math.h"
#include "../common/numeric.h" // Iota
#include "../common/numeric.h" // for Iota
#include "../common/threading_utils.h"
#include "../common/version.h"
#include "../data/adapter.h"
@@ -700,6 +703,14 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
}
}
void MetaInfo::SynchronizeNumberOfColumns() {
if (collective::IsFederated() && data_split_mode == DataSplitMode::kCol) {
collective::Allreduce<collective::Operation::kSum>(&num_col_, 1);
} else {
collective::Allreduce<collective::Operation::kMax>(&num_col_, 1);
}
}
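// Worked example (hypothetical numbers): with three workers holding 3, 2 and 4 local
// columns, the federated column-split path sums them to a global width of 9, while the
// row-split path takes the max, 4, since every worker already sees the same features.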
void MetaInfo::Validate(std::int32_t device) const {
if (group_ptr_.size() != 0 && weights_.Size() != 0) {
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1)
@@ -867,7 +878,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
data::FileAdapter adapter(parser.get());
dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
cache_file);
cache_file, data_split_mode);
} else {
data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart),
file_format};
@@ -903,11 +914,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
LOG(FATAL) << "Encountered parser error:\n" << e.what();
}
/* sync up number of features after matrix loaded.
* partitioned data will fail the train/val validation check
 * since partitioned data does not know the real number of features. */
collective::Allreduce<collective::Operation::kMax>(&dmat->Info().num_col_, 1);
if (need_split && data_split_mode == DataSplitMode::kCol) {
if (!cache_file.empty()) {
LOG(FATAL) << "Column-wise data split is not support for external memory.";
@@ -917,7 +923,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
delete dmat;
return sliced;
} else {
dmat->Info().data_split_mode = data_split_mode;
return dmat;
}
}
@@ -954,39 +959,49 @@ template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
XGDMatrixCallbackNext *next, float missing, int32_t n_threads, std::string);
template <typename AdapterT>
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const std::string&) {
return new data::SimpleDMatrix(adapter, missing, nthread);
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const std::string&,
DataSplitMode data_split_mode) {
return new data::SimpleDMatrix(adapter, missing, nthread, data_split_mode);
}
template DMatrix* DMatrix::Create<data::DenseAdapter>(data::DenseAdapter* adapter, float missing,
std::int32_t nthread,
const std::string& cache_prefix);
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::ArrayAdapter>(data::ArrayAdapter* adapter, float missing,
std::int32_t nthread,
const std::string& cache_prefix);
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSRAdapter>(data::CSRAdapter* adapter, float missing,
std::int32_t nthread,
const std::string& cache_prefix);
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSCAdapter>(data::CSCAdapter* adapter, float missing,
std::int32_t nthread,
const std::string& cache_prefix);
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::DataTableAdapter>(data::DataTableAdapter* adapter,
float missing, std::int32_t nthread,
const std::string& cache_prefix);
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::FileAdapter>(data::FileAdapter* adapter, float missing,
std::int32_t nthread,
const std::string& cache_prefix);
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSRArrayAdapter>(data::CSRArrayAdapter* adapter,
float missing, std::int32_t nthread,
const std::string& cache_prefix);
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSCArrayAdapter>(data::CSCArrayAdapter* adapter,
float missing, std::int32_t nthread,
const std::string& cache_prefix);
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create(
data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
float missing, int nthread, const std::string& cache_prefix);
float missing, int nthread, const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::RecordBatchesIterAdapter>(
data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&);
data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&,
DataSplitMode data_split_mode);
SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const {
SparsePage transpose;
@@ -1048,6 +1063,13 @@ void SparsePage::SortIndices(int32_t n_threads) {
});
}
void SparsePage::Reindex(uint64_t feature_offset, int32_t n_threads) {
auto& h_data = this->data.HostVector();
common::ParallelFor(h_data.size(), n_threads, [&](auto i) {
h_data[i].index += feature_offset;
});
}
void SparsePage::SortRows(int32_t n_threads) {
auto& h_offset = this->offset.HostVector();
auto& h_data = this->data.HostVector();
@@ -1144,7 +1166,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
});
}
exec.Rethrow();
CHECK(valid) << "Input data contains `inf` or `nan`";
CHECK(valid) << error::InfInData();
for (const auto & max : max_columns_vector) {
max_columns = std::max(max_columns, max[0]);
}

View File

@@ -208,17 +208,17 @@ void MetaInfo::SetInfoFromCUDA(Context const& ctx, StringView key, Json array) {
template <typename AdapterT>
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
const std::string& cache_prefix) {
const std::string& cache_prefix, DataSplitMode data_split_mode) {
CHECK_EQ(cache_prefix.size(), 0)
<< "Device memory construction is not currently supported with external "
"memory.";
return new data::SimpleDMatrix(adapter, missing, nthread);
return new data::SimpleDMatrix(adapter, missing, nthread, data_split_mode);
}
template DMatrix* DMatrix::Create<data::CudfAdapter>(
data::CudfAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix);
const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CupyAdapter>(
data::CupyAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix);
const std::string& cache_prefix, DataSplitMode data_split_mode);
} // namespace xgboost

View File

@@ -4,7 +4,10 @@
*/
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
#define XGBOOST_DATA_DEVICE_ADAPTER_H_
#include <cstddef> // for size_t
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/logical.h> // for none_of
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <string>
@@ -240,6 +243,20 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
return row_stride;
}
/**
* \brief Check there's no inf in data.
*/
template <typename AdapterBatchT>
bool HasInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
auto counting = thrust::make_counting_iterator(0llu);
auto value_iter = dh::MakeTransformIterator<float>(
counting, [=] XGBOOST_DEVICE(std::size_t idx) { return batch.GetElement(idx).value; });
auto valid =
thrust::none_of(value_iter, value_iter + batch.Size(),
[is_valid] XGBOOST_DEVICE(float v) { return is_valid(v) && std::isinf(v); });
return valid;
}
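// Note: the transform iterator materializes element values lazily, so the scan over
// batch.Size() elements runs on the device without staging a copy of the data.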
}; // namespace data
} // namespace xgboost
#endif // XGBOOST_DATA_DEVICE_ADAPTER_H_

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2019-2022 XGBoost contributors
/**
* Copyright 2019-2023 by XGBoost contributors
*/
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
@@ -9,7 +9,7 @@
#include "../common/random.h"
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "./ellpack_page.cuh"
#include "device_adapter.cuh"
#include "device_adapter.cuh" // for HasInfInData
#include "gradient_index.h"
#include "xgboost/data.h"
@@ -203,9 +203,8 @@ struct TupleScanOp {
// Here the data is already correctly ordered and simply needs to be compacted
// to remove missing data
template <typename AdapterBatchT>
void CopyDataToEllpack(const AdapterBatchT &batch,
common::Span<FeatureType const> feature_types,
EllpackPageImpl *dst, int device_idx, float missing) {
void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
EllpackPageImpl* dst, int device_idx, float missing) {
// Some witchcraft happens here
// The goal is to copy valid elements out of the input to an ELLPACK matrix
// with a given row stride, using no extra working memory. Standard stream
@@ -215,6 +214,9 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
// correct output position
auto counting = thrust::make_counting_iterator(0llu);
data::IsValidFunctor is_valid(missing);
bool valid = data::HasInfInData(batch, is_valid);
CHECK(valid) << error::InfInData();
auto key_iter = dh::MakeTransformIterator<size_t>(
counting,
[=] __device__(size_t idx) {
@@ -255,9 +257,9 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
cub::DispatchScan<decltype(key_value_index_iter), decltype(out),
TupleScanOp<Tuple>, cub::NullType, int64_t>;
#if THRUST_MAJOR_VERSION >= 2
DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
nullptr);
dh::safe_cuda(DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
nullptr));
#else
DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
@@ -265,9 +267,9 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
#endif
dh::TemporaryArray<char> temp_storage(temp_storage_bytes);
#if THRUST_MAJOR_VERSION >= 2
DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
key_value_index_iter, out, TupleScanOp<Tuple>(),
cub::NullType(), batch.Size(), nullptr);
dh::safe_cuda(DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
key_value_index_iter, out, TupleScanOp<Tuple>(),
cub::NullType(), batch.Size(), nullptr));
#else
DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
key_value_index_iter, out, TupleScanOp<Tuple>(),

View File

@@ -1,21 +1,23 @@
/*!
* Copyright 2017-2022 by XGBoost Contributors
/**
* Copyright 2017-2023 by XGBoost Contributors
* \brief Data type for fast histogram aggregation.
*/
#ifndef XGBOOST_DATA_GRADIENT_INDEX_H_
#define XGBOOST_DATA_GRADIENT_INDEX_H_
#include <algorithm> // std::min
#include <cinttypes> // std::uint32_t
#include <cstddef> // std::size_t
#include <algorithm> // for min
#include <atomic> // for atomic
#include <cinttypes> // for uint32_t
#include <cstddef> // for size_t
#include <memory>
#include <vector>
#include "../common/categorical.h"
#include "../common/error_msg.h" // for InfInData
#include "../common/hist_util.h"
#include "../common/numeric.h"
#include "../common/threading_utils.h"
#include "../common/transform_iterator.h" // common::MakeIndexTransformIter
#include "../common/transform_iterator.h" // for MakeIndexTransformIter
#include "adapter.h"
#include "proxy_dmatrix.h"
#include "xgboost/base.h"
@@ -62,6 +64,7 @@ class GHistIndexMatrix {
BinIdxType* index_data = index_data_span.data();
auto const& ptrs = cut.Ptrs();
auto const& values = cut.Values();
std::atomic<bool> valid{true};
common::ParallelFor(batch_size, batch_threads, [&](size_t i) {
auto line = batch.GetLine(i);
size_t ibegin = row_ptr[rbegin + i]; // index of first entry for current block
@@ -70,6 +73,9 @@ class GHistIndexMatrix {
for (size_t j = 0; j < line.Size(); ++j) {
data::COOTuple elem = line.GetElement(j);
if (is_valid(elem)) {
if (XGBOOST_EXPECT((std::isinf(elem.value)), false)) {
valid = false;
}
bst_bin_t bin_idx{-1};
if (common::IsCat(ft, elem.column_idx)) {
bin_idx = cut.SearchCatBin(elem.value, elem.column_idx, ptrs, values);
@@ -82,6 +88,8 @@ class GHistIndexMatrix {
}
}
});
CHECK(valid) << error::InfInData();
}
// Gather hit_count from all threads

View File

@@ -190,7 +190,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
// From here on Info() has the correct data shape
Info().num_row_ = accumulated_rows;
Info().num_nonzero_ = nnz;
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
Info().SynchronizeNumberOfColumns();
CHECK(std::none_of(column_sizes.cbegin(), column_sizes.cend(), [&](auto f) {
return f > accumulated_rows;
})) << "Something went wrong during iteration.";
@@ -257,6 +257,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
}
iter.Reset();
CHECK_EQ(rbegin, Info().num_row_);
CHECK_EQ(this->ghist_->Features(), Info().num_col_);
/**
* Generate column matrix

View File

@@ -195,7 +195,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
iter.Reset();
// Synchronise worker columns
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
info_.SynchronizeNumberOfColumns();
}
BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& param) {

View File

@@ -1,27 +1,24 @@
/*!
* Copyright 2021 XGBoost contributors
/**
* Copyright 2021-2023 XGBoost contributors
*/
#include <any> // for any, any_cast
#include "device_adapter.cuh"
#include "proxy_dmatrix.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
template <typename Fn>
decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
auto value = dmlc::get<std::shared_ptr<CupyAdapter>>(
proxy->Adapter())->Value();
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
auto value = dmlc::get<std::shared_ptr<CudfAdapter>>(
proxy->Adapter())->Value();
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
auto value = dmlc::get<std::shared_ptr<CudfAdapter>>(
proxy->Adapter())->Value();
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -1,11 +1,10 @@
/*!
* Copyright 2020-2022, XGBoost contributors
/**
* Copyright 2020-2023, XGBoost contributors
*/
#ifndef XGBOOST_DATA_PROXY_DMATRIX_H_
#define XGBOOST_DATA_PROXY_DMATRIX_H_
#include <dmlc/any.h>
#include <any> // for any, any_cast
#include <memory>
#include <string>
#include <utility>
@@ -15,8 +14,7 @@
#include "xgboost/context.h"
#include "xgboost/data.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
/*
* \brief A proxy to external iterator.
*/
@@ -44,7 +42,7 @@ class DataIterProxy {
*/
class DMatrixProxy : public DMatrix {
MetaInfo info_;
dmlc::any batch_;
std::any batch_;
Context ctx_;
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
@@ -115,9 +113,7 @@ class DMatrixProxy : public DMatrix {
LOG(FATAL) << "Not implemented.";
return BatchSet<ExtSparsePage>(BatchIterator<ExtSparsePage>(nullptr));
}
dmlc::any Adapter() const {
return batch_;
}
std::any Adapter() const { return batch_; }
};
inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
@@ -131,15 +127,13 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
template <typename Fn>
decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) {
if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) {
auto value =
dmlc::get<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
if (type_error) {
*type_error = false;
}
return fn(value);
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) {
auto value = dmlc::get<std::shared_ptr<ArrayAdapter>>(
proxy->Adapter())->Value();
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
if (type_error) {
*type_error = false;
}
@@ -154,6 +148,5 @@ decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_
decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_PROXY_DMATRIX_H_

View File

@@ -73,6 +73,19 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
return out;
}
void SimpleDMatrix::ReindexFeatures() {
if (collective::IsFederated() && info_.data_split_mode == DataSplitMode::kCol) {
std::vector<uint64_t> buffer(collective::GetWorldSize());
buffer[collective::GetRank()] = info_.num_col_;
collective::Allgather(buffer.data(), buffer.size() * sizeof(uint64_t));
auto offset = std::accumulate(buffer.cbegin(), buffer.cbegin() + collective::GetRank(), uint64_t{0});
if (offset == 0) {
return;
}
sparse_page_->Reindex(offset, ctx_.Threads());
}
}
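// Worked example (hypothetical numbers): with local widths {3, 2, 4} gathered into
// `buffer`, rank 0 gets offset 0 and returns early, rank 1 shifts its feature indices by
// 3, and rank 2 by 3 + 2 = 5, yielding one consistent global feature index space [0, 9).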
BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
// since CSR is the default data structure, `source_` is always available.
auto begin_iter = BatchIterator<SparsePage>(
@@ -151,7 +164,8 @@ BatchSet<ExtSparsePage> SimpleDMatrix::GetExtBatches(BatchParam const&) {
}
template <typename AdapterT>
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
DataSplitMode data_split_mode) {
this->ctx_.nthread = nthread;
std::vector<uint64_t> qids;
@@ -217,7 +231,9 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
// Synchronise worker columns
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
info_.data_split_mode = data_split_mode;
ReindexFeatures();
info_.SynchronizeNumberOfColumns();
if (adapter->NumRows() == kAdapterUnknownSize) {
using IteratorAdapterT
@@ -272,22 +288,31 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
fo->Write(sparse_page_->data.HostVector());
}
template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(CSCArrayAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CSCArrayAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(
IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>
*adapter,
float missing, int nthread);
float missing, int nthread, DataSplitMode data_split_mode);
template <>
SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread) {
ctx_.nthread = nthread;
SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode) {
ctx_.nthread = nthread;
auto& offset_vec = sparse_page_->offset.HostVector();
auto& data_vec = sparse_page_->data.HostVector();
@@ -346,7 +371,10 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
}
// Synchronise worker columns
info_.num_col_ = adapter->NumColumns();
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
info_.data_split_mode = data_split_mode;
ReindexFeatures();
info_.SynchronizeNumberOfColumns();
info_.num_row_ = total_batch_size;
info_.num_nonzero_ = data_vec.size();
CHECK_EQ(offset_vec.back(), info_.num_nonzero_);

View File

@@ -15,7 +15,10 @@ namespace data {
// Current implementation assumes a single batch. More batches can
// be supported in future. Does not currently support inferring row/column size
template <typename AdapterT>
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread*/) {
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread*/,
DataSplitMode data_split_mode) {
CHECK(data_split_mode != DataSplitMode::kCol)
<< "Column-wise data split is currently not supported on the GPU.";
auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice()
: adapter->DeviceIdx();
CHECK_GE(device, 0);
@@ -40,12 +43,13 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread
info_.num_col_ = adapter->NumColumns();
info_.num_row_ = adapter->NumRows();
// Synchronise worker columns
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
info_.data_split_mode = data_split_mode;
info_.SynchronizeNumberOfColumns();
}
template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
int nthread);
int nthread, DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
int nthread);
int nthread, DataSplitMode data_split_mode);
} // namespace data
} // namespace xgboost

View File

@@ -1,14 +1,13 @@
/*!
* Copyright 2019-2021 by XGBoost Contributors
/**
* Copyright 2019-2023 by XGBoost Contributors
* \file simple_dmatrix.cuh
*/
#ifndef XGBOOST_DATA_SIMPLE_DMATRIX_CUH_
#define XGBOOST_DATA_SIMPLE_DMATRIX_CUH_
#include <thrust/copy.h>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include "device_adapter.cuh"
#include <thrust/scan.h>
#if defined(XGBOOST_USE_CUDA)
#include "../common/device_helpers.cuh"
@@ -16,8 +15,10 @@
#include "../common/device_helpers.hip.h"
#endif
namespace xgboost {
namespace data {
#include "../common/error_msg.h" // for InfInData
#include "device_adapter.cuh" // for HasInfInData
namespace xgboost::data {
#if defined(XGBOOST_USE_CUDA)
template <typename AdapterBatchT>
@@ -94,7 +95,11 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
}
template <typename AdapterBatchT>
size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing, SparsePage* page) {
size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
SparsePage* page) {
bool valid = HasInfInData(batch, IsValidFunctor{missing});
CHECK(valid) << error::InfInData();
page->offset.SetDevice(device);
page->data.SetDevice(device);
page->offset.Resize(batch.NumRows() + 1);
@@ -106,6 +111,5 @@ size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missin
return num_nonzero_;
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_SIMPLE_DMATRIX_CUH_

View File

@@ -22,7 +22,8 @@ class SimpleDMatrix : public DMatrix {
public:
SimpleDMatrix() = default;
template <typename AdapterT>
explicit SimpleDMatrix(AdapterT* adapter, float missing, int nthread);
explicit SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
DataSplitMode data_split_mode = DataSplitMode::kRow);
explicit SimpleDMatrix(dmlc::Stream* in_stream);
~SimpleDMatrix() override = default;
@@ -61,6 +62,15 @@ class SimpleDMatrix : public DMatrix {
bool GHistIndexExists() const override { return static_cast<bool>(gradient_index_); }
bool SparsePageExists() const override { return true; }
/**
* \brief Reindex the features based on a global view.
*
* In some cases (e.g. vertical federated learning), features are loaded locally with indices
* starting from 0. However, all the algorithms assume the features are globally indexed, so we
* reindex the features based on the offset needed to obtain the global view.
*/
void ReindexFeatures();
private:
Context ctx_;
};

View File

@@ -96,7 +96,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
this->info_.num_col_ = n_features;
this->info_.num_nonzero_ = nnz;
collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
info_.SynchronizeNumberOfColumns();
CHECK_NE(info_.num_col_, 0);
}

View File

@@ -10,6 +10,7 @@
#include <dmlc/parameter.h>
#include <algorithm>
#include <cinttypes> // for uint32_t
#include <limits>
#include <memory>
#include <string>
@@ -27,9 +28,11 @@
#include "xgboost/host_device_vector.h"
#include "xgboost/json.h"
#include "xgboost/logging.h"
#include "xgboost/model.h"
#include "xgboost/objective.h"
#include "xgboost/predictor.h"
#include "xgboost/string_view.h"
#include "xgboost/string_view.h" // for StringView
#include "xgboost/tree_model.h" // for RegTree
#include "xgboost/tree_updater.h"
namespace xgboost::gbm {
@@ -131,6 +134,12 @@ void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
// set, since only experts are expected to do so.
return;
}
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
}
// tparam_ is set before calling this function.
if (tparam_.tree_method != TreeMethod::kAuto) {
return;
@@ -175,12 +184,12 @@ void GBTree::ConfigureUpdaters() {
case TreeMethod::kExact:
tparam_.updater_seq = "grow_colmaker,prune";
break;
case TreeMethod::kHist:
LOG(INFO) <<
"Tree method is selected to be 'hist', which uses a "
"single updater grow_quantile_histmaker.";
case TreeMethod::kHist: {
LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
"grow_quantile_histmaker.";
tparam_.updater_seq = "grow_quantile_histmaker";
break;
}
case TreeMethod::kGPUHist: {
common::AssertGPUSupport();
tparam_.updater_seq = "grow_gpu_hist";
@@ -209,11 +218,9 @@ void CopyGradient(HostDeviceVector<GradientPair> const* in_gpair, int32_t n_thre
GPUCopyGradient(in_gpair, n_groups, group_id, out_gpair);
} else {
std::vector<GradientPair> &tmp_h = out_gpair->HostVector();
auto nsize = static_cast<bst_omp_uint>(out_gpair->Size());
const auto &gpair_h = in_gpair->ConstHostVector();
common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) {
tmp_h[i] = gpair_h[i * n_groups + group_id];
});
const auto& gpair_h = in_gpair->ConstHostVector();
common::ParallelFor(out_gpair->Size(), n_threads,
[&](auto i) { tmp_h[i] = gpair_h[i * n_groups + group_id]; });
}
}
@@ -234,6 +241,7 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
CHECK_EQ(model_.param.num_parallel_tree, trees.size());
CHECK_EQ(model_.param.num_parallel_tree, 1)
<< "Boosting random forest is not supported for current objective.";
CHECK(!trees.front()->IsMultiTarget()) << "Update tree leaf" << MTNotImplemented();
CHECK_EQ(trees.size(), model_.param.num_parallel_tree);
for (std::size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) {
auto const& position = node_position.at(tree_idx);
@@ -245,17 +253,18 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) {
std::vector<std::vector<std::unique_ptr<RegTree>>> new_trees;
const int ngroup = model_.learner_model_param->num_output_group;
const int ngroup = model_.learner_model_param->OutputLength();
ConfigureWithKnownData(this->cfg_, p_fmat);
monitor_.Start("BoostNewTrees");
// Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let
// `gpu_id` be the single source of determining what algorithms to run, but that will
// break a lot of existing code.
auto device = tparam_.tree_method != TreeMethod::kGPUHist ? Context::kCpuId : ctx_->gpu_id;
auto out = linalg::TensorView<float, 2>{
auto out = linalg::MakeTensorView(
device,
device == Context::kCpuId ? predt->predictions.HostSpan() : predt->predictions.DeviceSpan(),
{static_cast<size_t>(p_fmat->Info().num_row_), static_cast<size_t>(ngroup)},
device};
p_fmat->Info().num_row_, model_.learner_model_param->OutputLength());
CHECK_NE(ngroup, 0);
if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf()) {
@@ -266,7 +275,13 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
// position is negated if the row is sampled out.
std::vector<HostDeviceVector<bst_node_t>> node_position;
if (ngroup == 1) {
if (model_.learner_model_param->IsVectorLeaf()) {
std::vector<std::unique_ptr<RegTree>> ret;
BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret);
UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret);
// No update prediction cache yet.
new_trees.push_back(std::move(ret));
} else if (model_.learner_model_param->OutputLength() == 1) {
std::vector<std::unique_ptr<RegTree>> ret;
BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret);
UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret);
@@ -360,8 +375,8 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
<< "Set `process_type` to `update` if you want to update existing "
"trees.";
// create new tree
std::unique_ptr<RegTree> ptr(new RegTree());
ptr->param.UpdateAllowUnknown(this->cfg_);
std::unique_ptr<RegTree> ptr(new RegTree{this->model_.learner_model_param->LeafLength(),
this->model_.learner_model_param->num_feature});
new_trees.push_back(ptr.get());
ret->push_back(std::move(ptr));
} else if (tparam_.process_type == TreeProcessType::kUpdate) {
@@ -383,11 +398,15 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
}
// update the trees
CHECK_EQ(gpair->Size(), p_fmat->Info().num_row_)
<< "Mismatching size between number of rows from input data and size of "
"gradient vector.";
auto n_out = model_.learner_model_param->OutputLength() * p_fmat->Info().num_row_;
StringView msg{
"Mismatching size between number of rows from input data and size of gradient vector."};
if (!model_.learner_model_param->IsVectorLeaf() && p_fmat->Info().num_row_ != 0) {
CHECK_EQ(n_out % gpair->Size(), 0) << msg;
} else {
CHECK_EQ(gpair->Size(), n_out) << msg;
}
CHECK(out_position);
out_position->resize(new_trees.size());
// Rescale learning rate according to the size of trees
@@ -402,8 +421,12 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) {
monitor_.Start("CommitModel");
for (uint32_t gid = 0; gid < model_.learner_model_param->num_output_group; ++gid) {
model_.CommitModel(std::move(new_trees[gid]), gid);
if (this->model_.learner_model_param->IsVectorLeaf()) {
model_.CommitModel(std::move(new_trees[0]), 0);
} else {
for (std::uint32_t gid = 0; gid < model_.learner_model_param->OutputLength(); ++gid) {
model_.CommitModel(std::move(new_trees[gid]), gid);
}
}
monitor_.Stop("CommitModel");
}
@@ -564,11 +587,10 @@ void GBTree::PredictBatch(DMatrix* p_fmat,
if (out_preds->version == 0) {
// out_preds->Size() can be non-zero as it's initialized here before any
// tree is built at the 0^th iteration.
predictor->InitOutPredictions(p_fmat->Info(), &out_preds->predictions,
model_);
predictor->InitOutPredictions(p_fmat->Info(), &out_preds->predictions, model_);
}
uint32_t tree_begin, tree_end;
std::uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
if (tree_end > tree_begin) {
@@ -577,7 +599,7 @@ void GBTree::PredictBatch(DMatrix* p_fmat,
if (reset) {
out_preds->version = 0;
} else {
uint32_t delta = layer_end - out_preds->version;
std::uint32_t delta = layer_end - out_preds->version;
out_preds->Update(delta);
}
}
@@ -770,6 +792,7 @@ class Dart : public GBTree {
void PredictBatchImpl(DMatrix *p_fmat, PredictionCacheEntry *p_out_preds,
bool training, unsigned layer_begin,
unsigned layer_end) const {
CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat);
CHECK(predictor);
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
@@ -830,6 +853,7 @@ class Dart : public GBTree {
void InplacePredict(std::shared_ptr<DMatrix> p_fmat, float missing,
PredictionCacheEntry* p_out_preds, uint32_t layer_begin,
unsigned layer_end) const override {
CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
auto n_groups = model_.learner_model_param->num_output_group;
@@ -996,8 +1020,9 @@ class Dart : public GBTree {
}
// set normalization factors
inline size_t NormalizeTrees(size_t size_new_trees) {
float lr = 1.0 * dparam_.learning_rate / size_new_trees;
std::size_t NormalizeTrees(size_t size_new_trees) {
CHECK(tree_param_.GetInitialised());
float lr = 1.0 * tree_param_.learning_rate / size_new_trees;
size_t num_drop = idx_drop_.size();
if (num_drop == 0) {
for (size_t i = 0; i < size_new_trees; ++i) {

View File

@@ -111,8 +111,6 @@ struct DartTrainParam : public XGBoostParameter<DartTrainParam> {
bool one_drop;
/*! \brief probability of skipping the dropout during an iteration */
float skip_drop;
/*! \brief learning step size for a time */
float learning_rate;
// declare parameters
DMLC_DECLARE_PARAMETER(DartTrainParam) {
DMLC_DECLARE_FIELD(sample_type)
@@ -136,24 +134,27 @@ struct DartTrainParam : public XGBoostParameter<DartTrainParam> {
.set_range(0.0f, 1.0f)
.set_default(0.0f)
.describe("Probability of skipping the dropout during a boosting iteration.");
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(0.3f)
.describe("Learning rate(step size) of update.");
DMLC_DECLARE_ALIAS(learning_rate, eta);
}
};
namespace detail {
// From here on, layer becomes concrete trees.
inline std::pair<uint32_t, uint32_t> LayerToTree(gbm::GBTreeModel const &model,
size_t layer_begin,
size_t layer_end) {
bst_group_t groups = model.learner_model_param->num_output_group;
uint32_t tree_begin = layer_begin * groups * model.param.num_parallel_tree;
uint32_t tree_end = layer_end * groups * model.param.num_parallel_tree;
inline std::pair<uint32_t, uint32_t> LayerToTree(gbm::GBTreeModel const& model,
std::uint32_t layer_begin,
std::uint32_t layer_end) {
std::uint32_t tree_begin;
std::uint32_t tree_end;
if (model.learner_model_param->IsVectorLeaf()) {
tree_begin = layer_begin * model.param.num_parallel_tree;
tree_end = layer_end * model.param.num_parallel_tree;
} else {
bst_group_t groups = model.learner_model_param->OutputLength();
tree_begin = layer_begin * groups * model.param.num_parallel_tree;
tree_end = layer_end * groups * model.param.num_parallel_tree;
}
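  // Worked example (hypothetical numbers): with scalar leaves, 3 output groups and
  // num_parallel_tree = 2, layers [1, 2) map to trees [6, 12); with vector leaves the same
  // layers map to trees [2, 4).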
if (tree_end == 0) {
tree_end = static_cast<uint32_t>(model.trees.size());
tree_end = model.trees.size();
}
if (model.trees.size() != 0) {
CHECK_LE(tree_begin, tree_end);
@@ -241,22 +242,25 @@ class GBTree : public GradientBooster {
void LoadModel(Json const& in) override;
// Number of trees per layer.
auto LayerTrees() const {
auto n_trees = model_.learner_model_param->num_output_group * model_.param.num_parallel_tree;
return n_trees;
[[nodiscard]] std::uint32_t LayerTrees() const {
if (model_.learner_model_param->IsVectorLeaf()) {
return model_.param.num_parallel_tree;
}
return model_.param.num_parallel_tree * model_.learner_model_param->OutputLength();
}
// slice the trees, out must be already allocated
void Slice(int32_t layer_begin, int32_t layer_end, int32_t step,
GradientBooster *out, bool* out_of_bound) const override;
int32_t BoostedRounds() const override {
[[nodiscard]] std::int32_t BoostedRounds() const override {
CHECK_NE(model_.param.num_parallel_tree, 0);
CHECK_NE(model_.learner_model_param->num_output_group, 0);
return model_.trees.size() / this->LayerTrees();
}
bool ModelFitted() const override {
[[nodiscard]] bool ModelFitted() const override {
return !model_.trees.empty() || !model_.trees_to_update.empty();
}

View File

@@ -326,7 +326,7 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
std::string booster;
std::string objective;
// This is a training parameter and is not saved (nor loaded) in the model.
MultiStrategy multi_strategy{MultiStrategy::kComposite};
MultiStrategy multi_strategy{MultiStrategy::kOneOutputPerTree};
// declare parameters
DMLC_DECLARE_PARAMETER(LearnerTrainParam) {
@@ -339,12 +339,12 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
.set_default("reg:squarederror")
.describe("Objective function used for obtaining gradient.");
DMLC_DECLARE_FIELD(multi_strategy)
.add_enum("composite", MultiStrategy::kComposite)
.add_enum("monolithic", MultiStrategy::kMonolithic)
.set_default(MultiStrategy::kComposite)
.add_enum("one_output_per_tree", MultiStrategy::kOneOutputPerTree)
.add_enum("multi_output_tree", MultiStrategy::kMultiOutputTree)
.set_default(MultiStrategy::kOneOutputPerTree)
.describe(
"Strategy used for training multi-target models. `mono` means building one single tree "
"for all targets.");
"Strategy used for training multi-target models. `multi_output_tree` means building "
"one single tree for all targets.");
}
};
@@ -440,7 +440,7 @@ class LearnerConfiguration : public Learner {
info.Validate(Ctx()->gpu_id);
// We estimate it from input data.
linalg::Tensor<float, 1> base_score;
UsePtr(obj_)->InitEstimation(info, &base_score);
InitEstimation(info, &base_score);
CHECK_EQ(base_score.Size(), 1);
mparam_.base_score = base_score(0);
CHECK(!std::isnan(mparam_.base_score));
@@ -775,8 +775,6 @@ class LearnerConfiguration : public Learner {
}
CHECK_NE(mparam_.num_feature, 0)
<< "0 feature is supplied. Are you using raw Booster interface?";
// Remove these once binary IO is gone.
cfg_["num_feature"] = common::ToString(mparam_.num_feature);
}
void ConfigureGBM(LearnerTrainParam const& old, Args const& args) {
@@ -859,17 +857,37 @@ class LearnerConfiguration : public Learner {
mparam_.num_target = n_targets;
}
}
void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) {
// Special handling for vertical federated learning.
if (collective::IsFederated() && info.data_split_mode == DataSplitMode::kCol) {
// We assume labels are only available on worker 0, so the estimation is calculated there
// and broadcast to other workers.
if (collective::GetRank() == 0) {
UsePtr(obj_)->InitEstimation(info, base_score);
collective::Broadcast(base_score->Data()->HostPointer(),
sizeof(bst_float) * base_score->Size(), 0);
} else {
base_score->Reshape(1);
collective::Broadcast(base_score->Data()->HostPointer(),
sizeof(bst_float) * base_score->Size(), 0);
}
} else {
UsePtr(obj_)->InitEstimation(info, base_score);
}
}
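// Note: both branches call Broadcast with the same payload size, so the collective stays
// in sync; only rank 0 computes the estimate from its labels.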
};
std::string const LearnerConfiguration::kEvalMetric {"eval_metric"}; // NOLINT
class LearnerIO : public LearnerConfiguration {
private:
std::set<std::string> saved_configs_ = {"num_round"};
// Used to identify the offset of JSON string when
// Will be removed once JSON takes over. Right now we still load some RDS files from R.
std::string const serialisation_header_ { u8"CONFIG-offset:" };
void ClearCaches() { this->prediction_container_ = PredictionContainer{}; }
public:
explicit LearnerIO(std::vector<std::shared_ptr<DMatrix>> cache) : LearnerConfiguration{cache} {}
@@ -922,6 +940,7 @@ class LearnerIO : public LearnerConfiguration {
}
this->need_configuration_ = true;
this->ClearCaches();
}
void SaveModel(Json* p_out) const override {
@@ -1015,21 +1034,11 @@ class LearnerIO : public LearnerConfiguration {
CHECK(fi->Read(&tparam_.booster)) << "BoostLearner: wrong model format";
obj_.reset(ObjFunction::Create(tparam_.objective, &ctx_));
gbm_.reset(GradientBooster::Create(tparam_.booster, &ctx_,
&learner_model_param_));
gbm_.reset(GradientBooster::Create(tparam_.booster, &ctx_, &learner_model_param_));
gbm_->Load(fi);
if (mparam_.contain_extra_attrs != 0) {
std::vector<std::pair<std::string, std::string> > attr;
fi->Read(&attr);
for (auto& kv : attr) {
const std::string prefix = "SAVED_PARAM_";
if (kv.first.find(prefix) == 0) {
const std::string saved_param = kv.first.substr(prefix.length());
if (saved_configs_.find(saved_param) != saved_configs_.end()) {
cfg_[saved_param] = kv.second;
}
}
}
attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
}
bool warn_old_model { false };
@@ -1098,6 +1107,7 @@ class LearnerIO : public LearnerConfiguration {
cfg_.insert(n.cbegin(), n.cend());
this->need_configuration_ = true;
this->ClearCaches();
}
// Save model into binary format. The code is about to be deprecated by more robust
@@ -1111,16 +1121,6 @@ class LearnerIO : public LearnerConfiguration {
std::vector<std::pair<std::string, std::string> > extra_attr;
mparam.contain_extra_attrs = 1;
{
std::vector<std::string> saved_params;
for (const auto& key : saved_params) {
auto it = cfg_.find(key);
if (it != cfg_.end()) {
mparam.contain_extra_attrs = 1;
extra_attr.emplace_back("SAVED_PARAM_" + key, it->second);
}
}
}
{
// Similar to JSON model IO, we save the objective.
Json j_obj { Object() };
@@ -1305,7 +1305,7 @@ class LearnerImpl : public LearnerIO {
monitor_.Stop("PredictRaw");
monitor_.Start("GetGradient");
obj_->GetGradient(predt.predictions, train->Info(), iter, &gpair_);
GetGradient(predt.predictions, train->Info(), iter, &gpair_);
monitor_.Stop("GetGradient");
TrainingObserver::Instance().Observe(gpair_, "Gradients");
@@ -1484,6 +1484,28 @@ class LearnerImpl : public LearnerIO {
}
private:
void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info, int iteration,
HostDeviceVector<GradientPair>* out_gpair) {
// Special handling for vertical federated learning.
if (collective::IsFederated() && info.data_split_mode == DataSplitMode::kCol) {
// We assume labels are only available on worker 0, so the gradients are calculated there
// and broadcast to other workers.
if (collective::GetRank() == 0) {
obj_->GetGradient(preds, info, iteration, out_gpair);
collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair),
0);
} else {
CHECK_EQ(info.labels.Size(), 0)
<< "In vertical federated learning, labels should only be on the first worker";
out_gpair->Resize(preds.Size());
collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair),
0);
}
} else {
obj_->GetGradient(preds, info, iteration, out_gpair);
}
}
/*! \brief random number transformation seed. */
static int32_t constexpr kRandSeedMagic = 127;
// gradient pairs

View File

@@ -20,23 +20,51 @@
// corresponding headers that bring in those function declarations can't be included with CUDA).
// This precludes the CPU and GPU logic from coexisting inside a .cu file
#include "rank_metric.h"
#include <dmlc/omp.h>
#include <dmlc/registry.h>
#include <xgboost/metric.h>
#include <cmath>
#include <vector>
#include <algorithm> // for stable_sort, copy, fill_n, min, max
#include <array> // for array
#include <cmath> // for log, sqrt
#include <cstddef> // for size_t, std
#include <cstdint> // for uint32_t
#include <functional> // for less, greater
#include <map> // for operator!=, _Rb_tree_const_iterator
#include <memory> // for allocator, unique_ptr, shared_ptr, __shared_...
#include <numeric> // for accumulate
#include <ostream> // for operator<<, basic_ostream, ostringstream
#include <string> // for char_traits, operator<, basic_string, to_string
#include <utility> // for pair, make_pair
#include <vector> // for vector
#include "../collective/communicator-inl.h"
#include "../common/algorithm.h" // Sort
#include "../common/math.h"
#include "../common/ranking_utils.h" // MakeMetricName
#include "../common/threading_utils.h"
#include "metric_common.h"
#include "xgboost/host_device_vector.h"
#include "../collective/communicator-inl.h" // for IsDistributed, Allreduce
#include "../collective/communicator.h" // for Operation
#include "../common/algorithm.h" // for ArgSort, Sort
#include "../common/linalg_op.h" // for cbegin, cend
#include "../common/math.h" // for CmpFirst
#include "../common/optional_weight.h" // for OptionalWeights, MakeOptionalWeights
#include "../common/ranking_utils.h" // for LambdaRankParam, NDCGCache, ParseMetricName
#include "../common/threading_utils.h" // for ParallelFor
#include "../common/transform_iterator.h" // for IndexTransformIter
#include "dmlc/common.h" // for OMPException
#include "metric_common.h" // for MetricNoCache, GPUMetric, PackedReduceResult
#include "xgboost/base.h" // for bst_float, bst_omp_uint, bst_group_t, Args
#include "xgboost/cache.h" // for DMatrixCache
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo, DMatrix
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/json.h" // for Json, FromJson, IsA, ToJson, get, Null, Object
#include "xgboost/linalg.h" // for Tensor, TensorView, Range, VectorView, MakeT...
#include "xgboost/logging.h" // for CHECK, ConsoleLogger, LOG_INFO, CHECK_EQ
#include "xgboost/metric.h" // for MetricReg, XGBOOST_REGISTER_METRIC, Metric
#include "xgboost/span.h" // for Span, operator!=
#include "xgboost/string_view.h" // for StringView
namespace {
using PredIndPair = std::pair<xgboost::bst_float, uint32_t>;
using PredIndPair = std::pair<xgboost::bst_float, xgboost::ltr::rel_degree_t>;
using PredIndPairContainer = std::vector<PredIndPair>;
/*
@@ -87,8 +115,7 @@ class PerGroupWeightPolicy {
} // anonymous namespace
namespace xgboost {
namespace metric {
namespace xgboost::metric {
// tag this file, used to force static linking later.
DMLC_REGISTRY_FILE_TAG(rank_metric);
@@ -257,71 +284,6 @@ struct EvalPrecision : public EvalRank {
}
};
/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
struct EvalNDCG : public EvalRank {
private:
double CalcDCG(const PredIndPairContainer &rec) const {
double sumdcg = 0.0;
for (size_t i = 0; i < rec.size() && i < this->topn; ++i) {
const unsigned rel = rec[i].second;
if (rel != 0) {
sumdcg += ((1 << rel) - 1) / std::log2(i + 2.0);
}
}
return sumdcg;
}
public:
explicit EvalNDCG(const char* name, const char* param) : EvalRank(name, param) {}
double EvalGroup(PredIndPairContainer *recptr) const override {
PredIndPairContainer &rec(*recptr);
std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
double dcg = CalcDCG(rec);
std::stable_sort(rec.begin(), rec.end(), common::CmpSecond);
double idcg = CalcDCG(rec);
if (idcg == 0.0f) {
if (this->minus) {
return 0.0f;
} else {
return 1.0f;
}
}
return dcg/idcg;
}
};
/*! \brief Mean Average Precision at N, for both classification and rank */
struct EvalMAP : public EvalRank {
public:
explicit EvalMAP(const char* name, const char* param) : EvalRank(name, param) {}
double EvalGroup(PredIndPairContainer *recptr) const override {
PredIndPairContainer &rec(*recptr);
std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
unsigned nhits = 0;
double sumap = 0.0;
for (size_t i = 0; i < rec.size(); ++i) {
if (rec[i].second != 0) {
nhits += 1;
if (i < this->topn) {
sumap += static_cast<double>(nhits) / (i + 1);
}
}
}
if (nhits != 0) {
sumap /= nhits;
return sumap;
} else {
if (this->minus) {
return 0.0;
} else {
return 1.0;
}
}
}
};
/*! \brief Cox: Partial likelihood of the Cox proportional hazards model */
struct EvalCox : public MetricNoCache {
public:
@@ -377,16 +339,213 @@ XGBOOST_REGISTER_METRIC(Precision, "pre")
.describe("precision@k for rank.")
.set_body([](const char* param) { return new EvalPrecision("pre", param); });
XGBOOST_REGISTER_METRIC(NDCG, "ndcg")
.describe("ndcg@k for rank.")
.set_body([](const char* param) { return new EvalNDCG("ndcg", param); });
XGBOOST_REGISTER_METRIC(MAP, "map")
.describe("map@k for rank.")
.set_body([](const char* param) { return new EvalMAP("map", param); });
XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik")
.describe("Negative log partial likelihood of Cox proportional hazards model.")
.set_body([](const char*) { return new EvalCox(); });
} // namespace metric
} // namespace xgboost
// ranking metrics that require a cache
template <typename Cache>
class EvalRankWithCache : public Metric {
protected:
ltr::LambdaRankParam param_;
bool minus_{false};
std::string name_;
DMatrixCache<Cache> cache_{DMatrixCache<Cache>::DefaultSize()};
public:
EvalRankWithCache(StringView name, const char* param) {
auto constexpr kMax = ltr::LambdaRankParam::NotSet();
std::uint32_t topn{kMax};
this->name_ = ltr::ParseMetricName(name, param, &topn, &minus_);
if (topn != kMax) {
param_.UpdateAllowUnknown(Args{{"lambdarank_num_pair_per_sample", std::to_string(topn)},
{"lambdarank_pair_method", "topk"}});
}
param_.UpdateAllowUnknown(Args{});
}
void Configure(Args const&) override {
    // Do not configure; otherwise the ndcg param would be forced to be the same as the one in
    // the objective.
}
void LoadConfig(Json const& in) override {
if (IsA<Null>(in)) {
return;
}
auto const& obj = get<Object const>(in);
auto it = obj.find("lambdarank_param");
if (it != obj.cend()) {
FromJson(it->second, &param_);
}
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["name"] = String{this->Name()};
out["lambdarank_param"] = ToJson(param_);
}
double Evaluate(HostDeviceVector<float> const& preds, std::shared_ptr<DMatrix> p_fmat) override {
auto const& info = p_fmat->Info();
auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_);
if (p_cache->Param() != param_) {
p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_);
}
CHECK(p_cache->Param() == param_);
CHECK_EQ(preds.Size(), info.labels.Size());
return this->Eval(preds, info, p_cache);
}
virtual double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
std::shared_ptr<Cache> p_cache) = 0;
};
namespace {
double Finalize(double score, double sw) {
std::array<double, 2> dat{score, sw};
collective::Allreduce<collective::Operation::kSum>(dat.data(), dat.size());
if (sw > 0.0) {
score = score / sw;
}
CHECK_LE(score, 1.0 + kRtEps)
<< "Invalid output score, might be caused by invalid query group weight.";
score = std::min(1.0, score);
return score;
}
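// Worked example (hypothetical numbers): two workers contributing {score, weight} pairs of
// {0.8, 1.0} and {0.6, 1.0} allreduce to {1.4, 2.0}, so the reported metric is 1.4 / 2.0 = 0.7.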
} // namespace
/**
* \brief Implement the NDCG score function for learning to rank.
*
 * Ties are ignored, which can lead to results that differ from other implementations.
*/
class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
public:
using EvalRankWithCache::EvalRankWithCache;
const char* Name() const override { return name_.c_str(); }
double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
std::shared_ptr<ltr::NDCGCache> p_cache) override {
if (ctx_->IsCUDA()) {
auto ndcg = cuda_impl::NDCGScore(ctx_, info, preds, minus_, p_cache);
return Finalize(ndcg.Residue(), ndcg.Weights());
}
// group local ndcg
auto group_ptr = p_cache->DataGroupPtr(ctx_);
bst_group_t n_groups = group_ptr.size() - 1;
auto ndcg_gloc = p_cache->Dcg(ctx_);
std::fill_n(ndcg_gloc.Values().data(), ndcg_gloc.Size(), 0.0);
auto h_inv_idcg = p_cache->InvIDCG(ctx_);
auto p_discount = p_cache->Discount(ctx_).data();
auto h_label = info.labels.HostView();
auto h_predt = linalg::MakeTensorView(ctx_, &preds, preds.Size());
auto weights = common::MakeOptionalWeights(ctx_, info.weights_);
common::ParallelFor(n_groups, ctx_->Threads(), [&](auto g) {
auto g_predt = h_predt.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
auto g_labels = h_label.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]), 0);
auto sorted_idx = common::ArgSort<std::size_t>(ctx_, linalg::cbegin(g_predt),
linalg::cend(g_predt), std::greater<>{});
double ndcg{.0};
double inv_idcg = h_inv_idcg(g);
if (inv_idcg <= 0.0) {
ndcg_gloc(g) = minus_ ? 0.0 : 1.0;
return;
}
std::size_t n{std::min(sorted_idx.size(), static_cast<std::size_t>(param_.TopK()))};
if (param_.ndcg_exp_gain) {
for (std::size_t i = 0; i < n; ++i) {
ndcg += p_discount[i] * ltr::CalcDCGGain(g_labels(sorted_idx[i])) * inv_idcg;
}
} else {
for (std::size_t i = 0; i < n; ++i) {
ndcg += p_discount[i] * g_labels(sorted_idx[i]) * inv_idcg;
}
}
ndcg_gloc(g) += ndcg * weights[g];
});
double sum_w{0};
if (weights.Empty()) {
sum_w = n_groups;
} else {
sum_w = std::accumulate(weights.weights.cbegin(), weights.weights.cend(), 0.0);
}
auto ndcg = std::accumulate(linalg::cbegin(ndcg_gloc), linalg::cend(ndcg_gloc), 0.0);
return Finalize(ndcg, sum_w);
}
};
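// A restatement of the per-group quantity computed above (explanatory, not
// part of the patch): with relevance l_i of the document ranked at position i
// by the predictions,
//
//   DCG@k   = sum_{i=0}^{k-1} gain(l_i) / log2(i + 2)
//   gain(l) = 2^l - 1  when ndcg_exp_gain, otherwise gain(l) = l
//   NDCG@k  = DCG@k / IDCG@k
//
// where IDCG@k is the DCG of the ideal (label-sorted) ordering, whose inverse
// is precomputed in the cache. Groups with IDCG = 0 score 1, or 0 when the
// metric is inverted via the "ndcg-" suffix (minus_).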
class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
public:
using EvalRankWithCache::EvalRankWithCache;
const char* Name() const override { return name_.c_str(); }
double Eval(HostDeviceVector<float> const& predt, MetaInfo const& info,
std::shared_ptr<ltr::MAPCache> p_cache) override {
if (ctx_->IsCUDA()) {
auto map = cuda_impl::MAPScore(ctx_, info, predt, minus_, p_cache);
return Finalize(map.Residue(), map.Weights());
}
auto gptr = p_cache->DataGroupPtr(ctx_);
auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
auto h_predt = linalg::MakeTensorView(ctx_, &predt, predt.Size());
auto map_gloc = p_cache->Map(ctx_);
std::fill_n(map_gloc.data(), map_gloc.size(), 0.0);
auto rank_idx = p_cache->SortedIdx(ctx_, predt.ConstHostSpan());
common::ParallelFor(p_cache->Groups(), ctx_->Threads(), [&](auto g) {
auto g_predt = h_predt.Slice(linalg::Range(gptr[g], gptr[g + 1]));
auto g_label = h_label.Slice(linalg::Range(gptr[g], gptr[g + 1]));
auto g_rank = rank_idx.subspan(gptr[g]);
auto n = std::min(static_cast<std::size_t>(param_.TopK()), g_label.Size());
double n_hits{0.0};
for (std::size_t i = 0; i < n; ++i) {
auto p = g_label(g_rank[i]);
n_hits += p;
map_gloc[g] += n_hits / static_cast<double>((i + 1)) * p;
}
for (std::size_t i = n; i < g_label.Size(); ++i) {
n_hits += g_label(g_rank[i]);
}
if (n_hits > 0.0) {
map_gloc[g] /= std::min(n_hits, static_cast<double>(param_.TopK()));
} else {
map_gloc[g] = minus_ ? 0.0 : 1.0;
}
});
auto sw = 0.0;
auto weight = common::MakeOptionalWeights(ctx_, info.weights_);
if (!weight.Empty()) {
CHECK_EQ(weight.weights.size(), p_cache->Groups());
}
for (std::size_t i = 0; i < map_gloc.size(); ++i) {
map_gloc[i] = map_gloc[i] * weight[i];
sw += weight[i];
}
auto sum = std::accumulate(map_gloc.cbegin(), map_gloc.cend(), 0.0);
return Finalize(sum, sw);
}
};
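// A restatement of the per-group computation above: with binary relevance
// rel_i at rank i (1-based) and hits_i = sum_{j<=i} rel_j,
//
//   AP@k = (1 / min(R, k)) * sum_{i=1}^{k} rel_i * hits_i / i
//
// where R is the total number of relevant documents in the group and k is
// TopK (lambdarank_num_pair_per_sample). Groups without any relevant document
// score 1, or 0 for the inverted "map-" variant (minus_).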
XGBOOST_REGISTER_METRIC(EvalMAP, "map")
.describe("map@k for ranking.")
.set_body([](char const* param) {
return new EvalMAPScore{"map", param};
});
XGBOOST_REGISTER_METRIC(EvalNDCG, "ndcg")
.describe("ndcg@k for ranking.")
.set_body([](char const* param) {
return new EvalNDCG{"ndcg", param};
});
} // namespace xgboost::metric

View File

@@ -2,22 +2,29 @@
* Copyright 2020-2023 by XGBoost Contributors
*/
#include <dmlc/registry.h>
#include <thrust/iterator/counting_iterator.h> // make_counting_iterator
#include <thrust/reduce.h> // reduce
#include <xgboost/metric.h>
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/reduce.h> // for reduce
#include <cstddef> // std::size_t
#include <memory> // std::shared_ptr
#include <algorithm> // for transform
#include <cstddef> // for size_t
#include <memory> // for shared_ptr
#include <vector> // for vector
#include "../common/cuda_context.cuh" // CUDAContext
#include "../common/cuda_context.cuh" // for CUDAContext
#include "../common/device_helpers.cuh" // for MakeTransformIterator
#include "../common/optional_weight.h" // for MakeOptionalWeights
#include "../common/ranking_utils.cuh" // for CalcQueriesDCG, NDCGCache
#include "metric_common.h"
#include "xgboost/base.h" // XGBOOST_DEVICE
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "rank_metric.h"
#include "xgboost/base.h" // for XGBOOST_DEVICE
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for MakeTensorView
#include "xgboost/logging.h" // for CHECK
#include "xgboost/metric.h"
namespace xgboost {
namespace metric {
namespace xgboost::metric {
// tag this file, used to force static linking later.
DMLC_REGISTRY_FILE_TAG(rank_metric_gpu);
@@ -134,200 +141,125 @@ struct EvalPrecisionGpu {
}
};
/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
struct EvalNDCGGpu {
public:
static void ComputeDCG(const dh::SegmentSorter<float> &pred_sorter,
const float *dlabels,
const EvalRankConfig &ecfg,
// The order in which labels have to be accessed, determined by sorting
// either the predictions or the labels for the entire dataset
const xgboost::common::Span<const uint32_t> &dlabels_sort_order,
dh::caching_device_vector<double> *dcgptr) {
dh::caching_device_vector<double> &dcgs(*dcgptr);
// Group info on device
const auto &dgroups = pred_sorter.GetGroupsSpan();
const auto &dgroup_idx = pred_sorter.GetGroupSegmentsSpan();
// First, determine the non-zero labels in the dataset individually
auto DetermineNonTrivialLabelLambda = [=] __device__(uint32_t idx) {
return (static_cast<unsigned>(dlabels[dlabels_sort_order[idx]]));
}; // NOLINT
// Find each group's DCG value
const auto nitems = pred_sorter.GetNumItems();
auto *ddcgs = dcgs.data().get();
int device_id = -1;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device_id));
#endif
// For each group item compute the aggregated precision
dh::LaunchN(nitems, nullptr, [=] __device__(uint32_t idx) {
const auto group_idx = dgroup_idx[idx];
const auto group_begin = dgroups[group_idx];
const auto ridx = idx - group_begin;
auto label = DetermineNonTrivialLabelLambda(idx);
if (ridx < ecfg.topn && label) {
atomicAdd(&ddcgs[group_idx], ((1 << label) - 1) / std::log2(ridx + 2.0));
}
});
}
static double EvalMetric(const dh::SegmentSorter<float> &pred_sorter,
const float *dlabels,
const EvalRankConfig &ecfg) {
// Sort the labels and compute IDCG
dh::SegmentSorter<float> segment_label_sorter;
segment_label_sorter.SortItems(dlabels, pred_sorter.GetNumItems(),
pred_sorter.GetGroupSegmentsSpan());
uint32_t ngroups = pred_sorter.GetNumGroups();
dh::caching_device_vector<double> idcg(ngroups, 0);
ComputeDCG(pred_sorter, dlabels, ecfg, segment_label_sorter.GetOriginalPositionsSpan(), &idcg);
// Compute the DCG values next
dh::caching_device_vector<double> dcg(ngroups, 0);
ComputeDCG(pred_sorter, dlabels, ecfg, pred_sorter.GetOriginalPositionsSpan(), &dcg);
double *ddcg = dcg.data().get();
double *didcg = idcg.data().get();
int device_id = -1;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device_id));
#endif
// Compute the group's DCG and reduce it across all groups
dh::LaunchN(ngroups, nullptr, [=] __device__(uint32_t gidx) {
if (didcg[gidx] == 0.0f) {
ddcg[gidx] = (ecfg.minus) ? 0.0f : 1.0f;
} else {
ddcg[gidx] /= didcg[gidx];
}
});
// Allocator to be used for managing space overhead while performing reductions
dh::XGBCachingDeviceAllocator<char> alloc;
#if defined(XGBOOST_USE_CUDA)
return thrust::reduce(thrust::cuda::par(alloc), dcg.begin(), dcg.end());
#elif defined(XGBOOST_USE_HIP)
return thrust::reduce(thrust::hip::par(alloc), dcg.begin(), dcg.end());
#endif
}
};
/*! \brief Mean Average Precision at N, for both classification and rank */
struct EvalMAPGpu {
public:
static double EvalMetric(const dh::SegmentSorter<float> &pred_sorter,
const float *dlabels,
const EvalRankConfig &ecfg) {
// Group info on device
const auto &dgroups = pred_sorter.GetGroupsSpan();
const auto ngroups = pred_sorter.GetNumGroups();
const auto &dgroup_idx = pred_sorter.GetGroupSegmentsSpan();
// Original positions of the predictions after they have been sorted
const auto &dpreds_orig_pos = pred_sorter.GetOriginalPositionsSpan();
// First, determine the non-zero labels in the dataset individually
const auto nitems = pred_sorter.GetNumItems();
dh::caching_device_vector<uint32_t> hits(nitems, 0);
auto DetermineNonTrivialLabelLambda = [=] __device__(uint32_t idx) {
return (static_cast<unsigned>(dlabels[dpreds_orig_pos[idx]]) != 0) ? 1 : 0;
}; // NOLINT
thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
thrust::make_counting_iterator(nitems),
hits.begin(),
DetermineNonTrivialLabelLambda);
// Allocator to be used by sort for managing space overhead while performing prefix scans
dh::XGBCachingDeviceAllocator<char> alloc;
// Next, prefix scan the non-trivial labels within each segment to accumulate them.
// This is required for computing the metric sum.
// Data is segmented into different groups...
#if defined(XGBOOST_USE_CUDA)
thrust::inclusive_scan_by_key(thrust::cuda::par(alloc),
dh::tcbegin(dgroup_idx), dh::tcend(dgroup_idx),
hits.begin(), // Input value
hits.begin()); // In-place scan
#elif defined(XGBOOST_USE_HIP)
thrust::inclusive_scan_by_key(thrust::hip::par(alloc),
dh::tcbegin(dgroup_idx), dh::tcend(dgroup_idx),
hits.begin(), // Input value
hits.begin()); // In-place scan
#endif
// Find each group's metric sum
dh::caching_device_vector<double> sumap(ngroups, 0);
auto *dsumap = sumap.data().get();
const auto *dhits = hits.data().get();
int device_id = -1;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device_id));
#endif
// For each group item compute the aggregated precision
dh::LaunchN(nitems, nullptr, [=] __device__(uint32_t idx) {
if (DetermineNonTrivialLabelLambda(idx)) {
const auto group_idx = dgroup_idx[idx];
const auto group_begin = dgroups[group_idx];
const auto ridx = idx - group_begin;
if (ridx < ecfg.topn) {
atomicAdd(&dsumap[group_idx],
static_cast<double>(dhits[idx]) / (ridx + 1));
}
}
});
// Aggregate the group's item precisions
dh::LaunchN(ngroups, nullptr, [=] __device__(uint32_t gidx) {
auto nhits = dgroups[gidx + 1] ? dhits[dgroups[gidx + 1] - 1] : 0;
if (nhits != 0) {
dsumap[gidx] /= nhits;
} else {
if (ecfg.minus) {
dsumap[gidx] = 0;
} else {
dsumap[gidx] = 1;
}
}
});
#if defined(XGBOOST_USE_CUDA)
return thrust::reduce(thrust::cuda::par(alloc), sumap.begin(), sumap.end());
#elif defined(XGBOOST_USE_HIP)
return thrust::reduce(thrust::hip::par(alloc), sumap.begin(), sumap.end());
#endif
}
};
XGBOOST_REGISTER_GPU_METRIC(PrecisionGpu, "pre")
.describe("precision@k for rank computed on GPU.")
.set_body([](const char* param) { return new EvalRankGpu<EvalPrecisionGpu>("pre", param); });
XGBOOST_REGISTER_GPU_METRIC(NDCGGpu, "ndcg")
.describe("ndcg@k for rank computed on GPU.")
.set_body([](const char* param) { return new EvalRankGpu<EvalNDCGGpu>("ndcg", param); });
namespace cuda_impl {
PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::NDCGCache> p_cache) {
CHECK(p_cache);
XGBOOST_REGISTER_GPU_METRIC(MAPGpu, "map")
.describe("map@k for rank computed on GPU.")
.set_body([](const char* param) { return new EvalRankGpu<EvalMAPGpu>("map", param); });
} // namespace metric
} // namespace xgboost
auto const &p = p_cache->Param();
auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
if (!d_weight.Empty()) {
CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
}
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
auto d_group_ptr = p_cache->DataGroupPtr(ctx);
auto d_inv_idcg = p_cache->InvIDCG(ctx);
auto d_sorted_idx = p_cache->SortedIdx(ctx, d_predt.Values());
auto d_out_dcg = p_cache->Dcg(ctx);
ltr::cuda_impl::CalcQueriesDCG(ctx, d_label, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(),
d_out_dcg);
auto it = dh::MakeTransformIterator<PackedReduceResult>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) {
if (d_inv_idcg(i) <= 0.0) {
return PackedReduceResult{minus ? 0.0 : 1.0, static_cast<double>(d_weight[i])};
}
return PackedReduceResult{d_out_dcg(i) * d_inv_idcg(i) * d_weight[i],
static_cast<double>(d_weight[i])};
});
auto pair = thrust::reduce(ctx->CUDACtx()->CTP(), it, it + d_out_dcg.Size(),
PackedReduceResult{0.0, 0.0});
return pair;
}
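// Note on the reduction above: PackedReduceResult carries the pair
// (weighted score, weight), so a single thrust::reduce over all groups
// produces both the numerator and the denominator that Finalize() divides.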
PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::MAPCache> p_cache) {
auto d_group_ptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
auto key_it = dh::MakeTransformIterator<std::size_t>(
thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return dh::SegmentId(d_group_ptr, i); });
auto get_label = [=] XGBOOST_DEVICE(std::size_t i) {
auto g = key_it[i];
auto g_begin = d_group_ptr[g];
auto g_end = d_group_ptr[g + 1];
i -= g_begin;
auto g_label = d_label.Slice(linalg::Range(g_begin, g_end));
auto g_rank = d_rank_idx.subspan(g_begin, g_end - g_begin);
return g_label(g_rank[i]);
};
auto it = dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), get_label);
auto cuctx = ctx->CUDACtx();
auto n_rel = p_cache->NumRelevant(ctx);
thrust::inclusive_scan_by_key(cuctx->CTP(), key_it, key_it + d_label.Size(), it, n_rel.data());
double topk = p_cache->Param().TopK();
auto map = p_cache->Map(ctx);
thrust::fill_n(cuctx->CTP(), map.data(), map.size(), 0.0);
{
auto val_it = dh::MakeTransformIterator<double>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) {
auto g = key_it[i];
auto g_begin = d_group_ptr[g];
auto g_end = d_group_ptr[g + 1];
i -= g_begin;
if (i >= topk) {
return 0.0;
}
auto g_label = d_label.Slice(linalg::Range(g_begin, g_end));
auto g_rank = d_rank_idx.subspan(g_begin, g_end - g_begin);
auto label = g_label(g_rank[i]);
auto g_n_rel = n_rel.subspan(g_begin, g_end - g_begin);
auto nhits = g_n_rel[i];
return nhits / static_cast<double>(i + 1) * label;
});
std::size_t bytes;
cub::DeviceSegmentedReduce::Sum(nullptr, bytes, val_it, map.data(), p_cache->Groups(),
d_group_ptr.data(), d_group_ptr.data() + 1, cuctx->Stream());
dh::TemporaryArray<char> temp(bytes);
cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, val_it, map.data(), p_cache->Groups(),
d_group_ptr.data(), d_group_ptr.data() + 1, cuctx->Stream());
}
PackedReduceResult result{0.0, 0.0};
{
auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
if (!d_weight.Empty()) {
CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
}
auto val_it = dh::MakeTransformIterator<PackedReduceResult>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t g) {
auto g_begin = d_group_ptr[g];
auto g_end = d_group_ptr[g + 1];
auto g_n_rel = n_rel.subspan(g_begin, g_end - g_begin);
if (!g_n_rel.empty() && g_n_rel.back() > 0.0) {
return PackedReduceResult{map[g] * d_weight[g] / std::min(g_n_rel.back(), topk),
static_cast<double>(d_weight[g])};
}
return PackedReduceResult{minus ? 0.0 : 1.0, static_cast<double>(d_weight[g])};
});
result =
thrust::reduce(cuctx->CTP(), val_it, val_it + map.size(), PackedReduceResult{0.0, 0.0});
}
return result;
}
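// The GPU MAP computation above proceeds in three passes: a segmented
// inclusive scan accumulates the running hit count within each group, a
// segmented reduce (cub::DeviceSegmentedReduce::Sum) sums the precision
// contributions rel_i * hits_i / i per group, and a final thrust::reduce
// folds the weighted per-group scores into one PackedReduceResult.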
} // namespace cuda_impl
} // namespace xgboost::metric

44
src/metric/rank_metric.h Normal file
View File

@@ -0,0 +1,44 @@
#ifndef XGBOOST_METRIC_RANK_METRIC_H_
#define XGBOOST_METRIC_RANK_METRIC_H_
/**
* Copyright 2023 by XGBoost Contributors
*/
#include <memory> // for shared_ptr
#include "../common/common.h" // for AssertGPUSupport
#include "../common/ranking_utils.h" // for NDCGCache, MAPCache
#include "metric_common.h" // for PackedReduceResult
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
namespace xgboost {
namespace metric {
namespace cuda_impl {
PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::NDCGCache> p_cache);
PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::MAPCache> p_cache);
#if !defined(XGBOOST_USE_CUDA)
inline PackedReduceResult NDCGScore(Context const *, MetaInfo const &,
HostDeviceVector<float> const &, bool,
std::shared_ptr<ltr::NDCGCache>) {
common::AssertGPUSupport();
return {};
}
inline PackedReduceResult MAPScore(Context const *, MetaInfo const &,
HostDeviceVector<float> const &, bool,
std::shared_ptr<ltr::MAPCache>) {
common::AssertGPUSupport();
return {};
}
#endif
} // namespace cuda_impl
} // namespace metric
} // namespace xgboost
#endif // XGBOOST_METRIC_RANK_METRIC_H_
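// (A note on the pattern above: the inline stubs guarded by
// `#if !defined(XGBOOST_USE_CUDA)` let rank_metric.cc call
// cuda_impl::NDCGScore/MAPScore unconditionally; CPU-only builds link the
// stubs, which abort via common::AssertGPUSupport() if ever reached.)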

View File

@@ -33,7 +33,7 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
new_obj->GetGradient(dummy_predt, info, 0, &gpair);
bst_target_t n_targets = this->Targets(info);
linalg::Vector<float> leaf_weight;
tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);
tree::FitStump(this->ctx_, info, gpair, n_targets, &leaf_weight);
// workaround, we don't support multi-target due to binary model serialization for
// base margin.

View File

@@ -1,52 +1,64 @@
/**
* Copyright 2017-2023 by XGBoost Contributors
*/
#include <dmlc/any.h>
#include <dmlc/omp.h>
#include <algorithm> // for max, fill, min
#include <any> // for any, any_cast
#include <cassert> // for assert
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t, int32_t, uint64_t
#include <memory> // for unique_ptr, shared_ptr
#include <ostream> // for char_traits, operator<<, basic_ostream
#include <typeinfo> // for type_info
#include <vector> // for vector
#include <cstddef>
#include <limits>
#include <mutex>
#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
#include "../collective/communicator.h" // for Operation
#include "../common/bitfield.h" // for RBitField8
#include "../common/categorical.h" // for IsCat, Decision
#include "../common/common.h" // for DivRoundUp
#include "../common/math.h" // for CheckNAN
#include "../common/threading_utils.h" // for ParallelFor
#include "../data/adapter.h" // for ArrayAdapter, CSRAdapter, CSRArrayAdapter
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "../data/proxy_dmatrix.h" // for DMatrixProxy
#include "../gbm/gbtree_model.h" // for GBTreeModel, GBTreeModelParam
#include "cpu_treeshap.h" // for CalculateContributions
#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG
#include "predict_fn.h" // for GetNextNode, GetNextNodeMulti
#include "xgboost/base.h" // for bst_float, bst_node_t, bst_omp_uint, bst_fe...
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for Entry, DMatrix, MetaInfo, SparsePage, Batch...
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/learner.h" // for LearnerModelParam
#include "xgboost/linalg.h" // for TensorView, All, VectorView, Tensor
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_NE
#include "xgboost/multi_target_tree_model.h" // for MultiTargetTree
#include "xgboost/predictor.h" // for PredictionCacheEntry, Predictor, PredictorReg
#include "xgboost/span.h" // for Span
#include "xgboost/tree_model.h" // for RegTree, MTNotImplemented, RTreeNodeStat
#include "../collective/communicator-inl.h"
#include "../common/categorical.h"
#include "../common/math.h"
#include "../common/threading_utils.h"
#include "../data/adapter.h"
#include "../data/gradient_index.h"
#include "../gbm/gbtree_model.h"
#include "cpu_treeshap.h" // CalculateContributions
#include "predict_fn.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/logging.h"
#include "xgboost/predictor.h"
#include "xgboost/tree_model.h"
namespace xgboost {
namespace predictor {
namespace xgboost::predictor {
DMLC_REGISTRY_FILE_TAG(cpu_predictor);
namespace scalar {
template <bool has_missing, bool has_categorical>
bst_node_t GetLeafIndex(RegTree const &tree, const RegTree::FVec &feat,
RegTree::CategoricalSplitMatrix const& cats) {
bst_node_t nid = 0;
while (!tree[nid].IsLeaf()) {
unsigned split_index = tree[nid].SplitIndex();
RegTree::CategoricalSplitMatrix const &cats) {
bst_node_t nidx{0};
while (!tree[nidx].IsLeaf()) {
bst_feature_t split_index = tree[nidx].SplitIndex();
auto fvalue = feat.GetFvalue(split_index);
nid = GetNextNode<has_missing, has_categorical>(
tree[nid], nid, fvalue, has_missing && feat.IsMissing(split_index), cats);
nidx = GetNextNode<has_missing, has_categorical>(
tree[nidx], nidx, fvalue, has_missing && feat.IsMissing(split_index), cats);
}
return nid;
return nidx;
}
bst_float PredValue(const SparsePage::Inst &inst,
const std::vector<std::unique_ptr<RegTree>> &trees,
const std::vector<int> &tree_info, int bst_group,
RegTree::FVec *p_feats, unsigned tree_begin,
unsigned tree_end) {
const std::vector<int> &tree_info, std::int32_t bst_group,
RegTree::FVec *p_feats, std::uint32_t tree_begin, std::uint32_t tree_end) {
bst_float psum = 0.0f;
p_feats->Fill(inst);
for (size_t i = tree_begin; i < tree_end; ++i) {
@@ -68,36 +80,80 @@ bst_float PredValue(const SparsePage::Inst &inst,
}
template <bool has_categorical>
bst_float
PredValueByOneTree(const RegTree::FVec &p_feats, RegTree const &tree,
RegTree::CategoricalSplitMatrix const& cats) {
const bst_node_t leaf = p_feats.HasMissing() ?
GetLeafIndex<true, has_categorical>(tree, p_feats, cats) :
GetLeafIndex<false, has_categorical>(tree, p_feats, cats);
bst_float PredValueByOneTree(const RegTree::FVec &p_feats, RegTree const &tree,
RegTree::CategoricalSplitMatrix const &cats) {
const bst_node_t leaf = p_feats.HasMissing()
? GetLeafIndex<true, has_categorical>(tree, p_feats, cats)
: GetLeafIndex<false, has_categorical>(tree, p_feats, cats);
return tree[leaf].LeafValue();
}
} // namespace scalar
void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_begin,
const size_t tree_end, std::vector<bst_float> *out_preds,
const size_t predict_offset, const size_t num_group,
const std::vector<RegTree::FVec> &thread_temp,
const size_t offset, const size_t block_size) {
std::vector<bst_float> &preds = *out_preds;
for (size_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) {
const size_t gid = model.tree_info[tree_id];
auto const &tree = *model.trees[tree_id];
auto const& cats = tree.GetCategoriesMatrix();
auto has_categorical = tree.HasCategoricalSplit();
namespace multi {
template <bool has_missing, bool has_categorical>
bst_node_t GetLeafIndex(MultiTargetTree const &tree, const RegTree::FVec &feat,
RegTree::CategoricalSplitMatrix const &cats) {
bst_node_t nidx{0};
while (!tree.IsLeaf(nidx)) {
unsigned split_index = tree.SplitIndex(nidx);
auto fvalue = feat.GetFvalue(split_index);
nidx = GetNextNodeMulti<has_missing, has_categorical>(
tree, nidx, fvalue, has_missing && feat.IsMissing(split_index), cats);
}
return nidx;
}
if (has_categorical) {
for (size_t i = 0; i < block_size; ++i) {
preds[(predict_offset + i) * num_group + gid] +=
PredValueByOneTree<true>(thread_temp[offset + i], tree, cats);
template <bool has_categorical>
void PredValueByOneTree(RegTree::FVec const &p_feats, MultiTargetTree const &tree,
RegTree::CategoricalSplitMatrix const &cats,
linalg::VectorView<float> out_predt) {
bst_node_t const leaf = p_feats.HasMissing()
? GetLeafIndex<true, has_categorical>(tree, p_feats, cats)
: GetLeafIndex<false, has_categorical>(tree, p_feats, cats);
auto leaf_value = tree.LeafValue(leaf);
assert(out_predt.Shape(0) == leaf_value.Shape(0) && "shape mismatch.");
for (size_t i = 0; i < leaf_value.Size(); ++i) {
out_predt(i) += leaf_value(i);
}
}
} // namespace multi
namespace {
void PredictByAllTrees(gbm::GBTreeModel const &model, std::uint32_t const tree_begin,
std::uint32_t const tree_end, std::size_t const predict_offset,
std::vector<RegTree::FVec> const &thread_temp, std::size_t const offset,
std::size_t const block_size, linalg::MatrixView<float> out_predt) {
for (std::uint32_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) {
auto const &tree = *model.trees.at(tree_id);
auto const &cats = tree.GetCategoriesMatrix();
bool has_categorical = tree.HasCategoricalSplit();
if (tree.IsMultiTarget()) {
if (has_categorical) {
for (std::size_t i = 0; i < block_size; ++i) {
auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
multi::PredValueByOneTree<true>(thread_temp[offset + i], *tree.GetMultiTargetTree(), cats,
t_predts);
}
} else {
for (std::size_t i = 0; i < block_size; ++i) {
auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
multi::PredValueByOneTree<false>(thread_temp[offset + i], *tree.GetMultiTargetTree(),
cats, t_predts);
}
}
} else {
for (size_t i = 0; i < block_size; ++i) {
preds[(predict_offset + i) * num_group + gid] +=
PredValueByOneTree<false>(thread_temp[offset + i], tree, cats);
auto const gid = model.tree_info[tree_id];
if (has_categorical) {
for (std::size_t i = 0; i < block_size; ++i) {
out_predt(predict_offset + i, gid) +=
scalar::PredValueByOneTree<true>(thread_temp[offset + i], tree, cats);
}
} else {
for (std::size_t i = 0; i < block_size; ++i) {
out_predt(predict_offset + i, gid) +=
scalar::PredValueByOneTree<false>(thread_temp[offset + i], tree, cats);
}
}
}
}
@@ -105,7 +161,7 @@ void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_begin,
template <typename DataView>
void FVecFill(const size_t block_size, const size_t batch_offset, const int num_feature,
DataView* batch, const size_t fvec_offset, std::vector<RegTree::FVec>* p_feats) {
DataView *batch, const size_t fvec_offset, std::vector<RegTree::FVec> *p_feats) {
for (size_t i = 0; i < block_size; ++i) {
RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
if (feats.Size() == 0) {
@@ -117,8 +173,8 @@ void FVecFill(const size_t block_size, const size_t batch_offset, const int num_
}
template <typename DataView>
void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batch,
const size_t fvec_offset, std::vector<RegTree::FVec>* p_feats) {
void FVecDrop(const size_t block_size, const size_t batch_offset, DataView *batch,
const size_t fvec_offset, std::vector<RegTree::FVec> *p_feats) {
for (size_t i = 0; i < block_size; ++i) {
RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
const SparsePage::Inst inst = (*batch)[batch_offset + i];
@@ -126,9 +182,7 @@ void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batc
}
}
namespace {
static size_t constexpr kUnroll = 8;
} // anonymous namespace
static std::size_t constexpr kUnroll = 8;
struct SparsePageView {
bst_row_t base_rowid;
@@ -227,15 +281,13 @@ class AdapterView {
};
template <typename DataView, size_t block_of_rows_size>
void PredictBatchByBlockOfRowsKernel(
DataView batch, std::vector<bst_float> *out_preds,
gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end,
std::vector<RegTree::FVec> *p_thread_temp, int32_t n_threads) {
void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &model,
std::uint32_t tree_begin, std::uint32_t tree_end,
std::vector<RegTree::FVec> *p_thread_temp, int32_t n_threads,
linalg::TensorView<float, 2> out_predt) {
auto &thread_temp = *p_thread_temp;
int32_t const num_group = model.learner_model_param->num_output_group;
CHECK_EQ(model.param.size_leaf_vector, 0)
<< "size_leaf_vector is enforced to 0 so far";
CHECK_EQ(model.param.size_leaf_vector, 0) << "size_leaf_vector is enforced to 0 so far";
// parallel over local batch
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
const int num_feature = model.learner_model_param->num_feature;
@@ -243,16 +295,13 @@ void PredictBatchByBlockOfRowsKernel(
common::ParallelFor(n_blocks, n_threads, [&](bst_omp_uint block_id) {
const size_t batch_offset = block_id * block_of_rows_size;
const size_t block_size =
std::min(nsize - batch_offset, block_of_rows_size);
const size_t block_size = std::min(nsize - batch_offset, block_of_rows_size);
const size_t fvec_offset = omp_get_thread_num() * block_of_rows_size;
FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset,
p_thread_temp);
FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, p_thread_temp);
// process block of rows through all trees to keep cache locality
PredictByAllTrees(model, tree_begin, tree_end, out_preds,
batch_offset + batch.base_rowid, num_group, thread_temp,
fvec_offset, block_size);
PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid, thread_temp,
fvec_offset, block_size, out_predt);
FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
});
}
@@ -275,7 +324,7 @@ float FillNodeMeanValues(RegTree const *tree, bst_node_t nidx, std::vector<float
}
void FillNodeMeanValues(RegTree const* tree, std::vector<float>* mean_values) {
size_t num_nodes = tree->param.num_nodes;
size_t num_nodes = tree->NumNodes();
if (mean_values->size() == num_nodes) {
return;
}
@@ -283,7 +332,6 @@ void FillNodeMeanValues(RegTree const* tree, std::vector<float>* mean_values) {
FillNodeMeanValues(tree, 0, mean_values);
}
namespace {
// init thread buffers
static void InitThreadTemp(int nthread, std::vector<RegTree::FVec> *out) {
int prev_thread_temp_size = out->size();
@@ -557,33 +605,6 @@ class ColumnSplitHelper {
class CPUPredictor : public Predictor {
protected:
void PredictGHistIndex(DMatrix *p_fmat, gbm::GBTreeModel const &model, int32_t tree_begin,
int32_t tree_end, std::vector<bst_float> *out_preds) const {
auto const n_threads = this->ctx_->Threads();
constexpr double kDensityThresh = .5;
size_t total =
std::max(p_fmat->Info().num_row_ * p_fmat->Info().num_col_, static_cast<uint64_t>(1));
double density = static_cast<double>(p_fmat->Info().num_nonzero_) / static_cast<double>(total);
bool blocked = density > kDensityThresh;
std::vector<RegTree::FVec> feat_vecs;
InitThreadTemp(n_threads * (blocked ? kBlockOfRowsSize : 1), &feat_vecs);
std::vector<Entry> workspace(p_fmat->Info().num_col_ * kUnroll * n_threads);
auto ft = p_fmat->Info().feature_types.ConstHostVector();
for (auto const &batch : p_fmat->GetBatches<GHistIndexMatrix>({})) {
if (blocked) {
PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, kBlockOfRowsSize>(
GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads},
out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads);
} else {
PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, 1>(
GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads},
out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads);
}
}
}
void PredictDMatrix(DMatrix *p_fmat, std::vector<bst_float> *out_preds,
gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end) const {
if (p_fmat->IsColumnSplit()) {
@@ -592,11 +613,6 @@ class CPUPredictor : public Predictor {
return;
}
if (!p_fmat->PageExists<SparsePage>()) {
this->PredictGHistIndex(p_fmat, model, tree_begin, tree_end, out_preds);
return;
}
auto const n_threads = this->ctx_->Threads();
constexpr double kDensityThresh = .5;
size_t total =
@@ -606,16 +622,38 @@ class CPUPredictor : public Predictor {
std::vector<RegTree::FVec> feat_vecs;
InitThreadTemp(n_threads * (blocked ? kBlockOfRowsSize : 1), &feat_vecs);
for (auto const &batch : p_fmat->GetBatches<SparsePage>()) {
CHECK_EQ(out_preds->size(),
p_fmat->Info().num_row_ * model.learner_model_param->num_output_group);
if (blocked) {
PredictBatchByBlockOfRowsKernel<SparsePageView, kBlockOfRowsSize>(
SparsePageView{&batch}, out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads);
} else {
PredictBatchByBlockOfRowsKernel<SparsePageView, 1>(
SparsePageView{&batch}, out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads);
std::size_t n_samples = p_fmat->Info().num_row_;
std::size_t n_groups = model.learner_model_param->OutputLength();
CHECK_EQ(out_preds->size(), n_samples * n_groups);
linalg::TensorView<float, 2> out_predt{*out_preds, {n_samples, n_groups}, ctx_->gpu_id};
if (!p_fmat->PageExists<SparsePage>()) {
std::vector<Entry> workspace(p_fmat->Info().num_col_ * kUnroll * n_threads);
auto ft = p_fmat->Info().feature_types.ConstHostVector();
for (auto const &batch : p_fmat->GetBatches<GHistIndexMatrix>({})) {
if (blocked) {
PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, kBlockOfRowsSize>(
GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, model,
tree_begin, tree_end, &feat_vecs, n_threads, out_predt);
} else {
PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, 1>(
GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, model,
tree_begin, tree_end, &feat_vecs, n_threads, out_predt);
}
}
} else {
for (auto const &batch : p_fmat->GetBatches<SparsePage>()) {
if (blocked) {
PredictBatchByBlockOfRowsKernel<SparsePageView, kBlockOfRowsSize>(
SparsePageView{&batch}, model, tree_begin, tree_end, &feat_vecs, n_threads,
out_predt);
} else {
PredictBatchByBlockOfRowsKernel<SparsePageView, 1>(SparsePageView{&batch}, model,
tree_begin, tree_end, &feat_vecs,
n_threads, out_predt);
}
}
}
}
@@ -623,26 +661,24 @@ class CPUPredictor : public Predictor {
public:
explicit CPUPredictor(Context const *ctx) : Predictor::Predictor{ctx} {}
void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts,
const gbm::GBTreeModel &model, uint32_t tree_begin,
uint32_t tree_end = 0) const override {
auto* out_preds = &predts->predictions;
void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts, const gbm::GBTreeModel &model,
uint32_t tree_begin, uint32_t tree_end = 0) const override {
auto *out_preds = &predts->predictions;
// This is actually already handled in gbm, but large amount of tests rely on the
// behaviour.
if (tree_end == 0) {
tree_end = model.trees.size();
}
this->PredictDMatrix(dmat, &out_preds->HostVector(), model, tree_begin,
tree_end);
this->PredictDMatrix(dmat, &out_preds->HostVector(), model, tree_begin, tree_end);
}
template <typename Adapter, size_t kBlockSize>
void DispatchedInplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
void DispatchedInplacePredict(std::any const &x, std::shared_ptr<DMatrix> p_m,
const gbm::GBTreeModel &model, float missing,
PredictionCacheEntry *out_preds,
uint32_t tree_begin, uint32_t tree_end) const {
PredictionCacheEntry *out_preds, uint32_t tree_begin,
uint32_t tree_end) const {
auto const n_threads = this->ctx_->Threads();
auto m = dmlc::get<std::shared_ptr<Adapter>>(x);
auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
<< "Number of columns in data must equal to trained model.";
if (p_m) {
@@ -653,13 +689,16 @@ class CPUPredictor : public Predictor {
info.num_row_ = m->NumRows();
this->InitOutPredictions(info, &(out_preds->predictions), model);
}
std::vector<Entry> workspace(m->NumColumns() * kUnroll * n_threads);
auto &predictions = out_preds->predictions.HostVector();
std::vector<RegTree::FVec> thread_temp;
InitThreadTemp(n_threads * kBlockSize, &thread_temp);
std::size_t n_groups = model.learner_model_param->OutputLength();
linalg::TensorView<float, 2> out_predt{predictions, {m->NumRows(), n_groups}, Context::kCpuId};
PredictBatchByBlockOfRowsKernel<AdapterView<Adapter>, kBlockSize>(
AdapterView<Adapter>(m.get(), missing, common::Span<Entry>{workspace}, n_threads),
&predictions, model, tree_begin, tree_end, &thread_temp, n_threads);
AdapterView<Adapter>(m.get(), missing, common::Span<Entry>{workspace}, n_threads), model,
tree_begin, tree_end, &thread_temp, n_threads, out_predt);
}
bool InplacePredict(std::shared_ptr<DMatrix> p_m, const gbm::GBTreeModel &model, float missing,
@@ -689,6 +728,7 @@ class CPUPredictor : public Predictor {
void PredictInstance(const SparsePage::Inst& inst,
std::vector<bst_float>* out_preds,
const gbm::GBTreeModel& model, unsigned ntree_limit) const override {
CHECK(!model.learner_model_param->IsVectorLeaf()) << "predict instance" << MTNotImplemented();
std::vector<RegTree::FVec> feat_vecs;
feat_vecs.resize(1, RegTree::FVec());
feat_vecs[0].Init(model.learner_model_param->num_feature);
@@ -701,31 +741,30 @@ class CPUPredictor : public Predictor {
auto base_score = model.learner_model_param->BaseScore(ctx_)(0);
// loop over output groups
for (uint32_t gid = 0; gid < model.learner_model_param->num_output_group; ++gid) {
(*out_preds)[gid] =
PredValue(inst, model.trees, model.tree_info, gid, &feat_vecs[0], 0, ntree_limit) +
base_score;
(*out_preds)[gid] = scalar::PredValue(inst, model.trees, model.tree_info, gid, &feat_vecs[0],
0, ntree_limit) +
base_score;
}
}
void PredictLeaf(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_preds,
const gbm::GBTreeModel& model, unsigned ntree_limit) const override {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *out_preds,
const gbm::GBTreeModel &model, unsigned ntree_limit) const override {
auto const n_threads = this->ctx_->Threads();
std::vector<RegTree::FVec> feat_vecs;
const int num_feature = model.learner_model_param->num_feature;
InitThreadTemp(n_threads, &feat_vecs);
const MetaInfo& info = p_fmat->Info();
const MetaInfo &info = p_fmat->Info();
// number of valid trees
if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
ntree_limit = static_cast<unsigned>(model.trees.size());
}
std::vector<bst_float>& preds = out_preds->HostVector();
std::vector<bst_float> &preds = out_preds->HostVector();
preds.resize(info.num_row_ * ntree_limit);
// start collecting the prediction
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
// parallel over local batch
auto page = batch.GetView();
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) {
common::ParallelFor(page.Size(), n_threads, [&](auto i) {
const int tid = omp_get_thread_num();
auto ridx = static_cast<size_t>(batch.base_rowid + i);
RegTree::FVec &feats = feat_vecs[tid];
@@ -733,23 +772,28 @@ class CPUPredictor : public Predictor {
feats.Init(num_feature);
}
feats.Fill(page[i]);
for (unsigned j = 0; j < ntree_limit; ++j) {
auto const& tree = *model.trees[j];
auto const& cats = tree.GetCategoriesMatrix();
bst_node_t tid = GetLeafIndex<true, true>(tree, feats, cats);
preds[ridx * ntree_limit + j] = static_cast<bst_float>(tid);
for (std::uint32_t j = 0; j < ntree_limit; ++j) {
auto const &tree = *model.trees[j];
auto const &cats = tree.GetCategoriesMatrix();
bst_node_t nidx;
if (tree.IsMultiTarget()) {
nidx = multi::GetLeafIndex<true, true>(*tree.GetMultiTargetTree(), feats, cats);
} else {
nidx = scalar::GetLeafIndex<true, true>(tree, feats, cats);
}
preds[ridx * ntree_limit + j] = static_cast<bst_float>(nidx);
}
feats.Drop(page[i]);
});
}
}
void PredictContribution(DMatrix *p_fmat,
HostDeviceVector<float> *out_contribs,
void PredictContribution(DMatrix *p_fmat, HostDeviceVector<float> *out_contribs,
const gbm::GBTreeModel &model, uint32_t ntree_limit,
std::vector<bst_float> const *tree_weights,
bool approximate, int condition,
unsigned condition_feature) const override {
std::vector<bst_float> const *tree_weights, bool approximate,
int condition, unsigned condition_feature) const override {
CHECK(!model.learner_model_param->IsVectorLeaf())
<< "Predict contribution" << MTNotImplemented();
auto const n_threads = this->ctx_->Threads();
const int num_feature = model.learner_model_param->num_feature;
std::vector<RegTree::FVec> feat_vecs;
@@ -825,11 +869,12 @@ class CPUPredictor : public Predictor {
}
}
void PredictInteractionContributions(
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
const gbm::GBTreeModel &model, unsigned ntree_limit,
std::vector<bst_float> const *tree_weights,
bool approximate) const override {
void PredictInteractionContributions(DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
const gbm::GBTreeModel &model, unsigned ntree_limit,
std::vector<bst_float> const *tree_weights,
bool approximate) const override {
CHECK(!model.learner_model_param->IsVectorLeaf())
<< "Predict interaction contribution" << MTNotImplemented();
const MetaInfo& info = p_fmat->Info();
const int ngroup = model.learner_model_param->num_output_group;
size_t const ncolumns = model.learner_model_param->num_feature;
@@ -884,5 +929,4 @@ class CPUPredictor : public Predictor {
XGBOOST_REGISTER_PREDICTOR(CPUPredictor, "cpu_predictor")
.describe("Make predictions using CPU.")
.set_body([](Context const *ctx) { return new CPUPredictor(ctx); });
} // namespace predictor
} // namespace xgboost
} // namespace xgboost::predictor

View File

@@ -9,6 +9,7 @@
#include <thrust/fill.h>
#include <thrust/host_vector.h>
#include <any> // for any, any_cast
#include <memory>
#include "../common/bitfield.h"
@@ -431,7 +432,7 @@ class DeviceModel {
this->tree_beg_ = tree_begin;
this->tree_end_ = tree_end;
this->num_group = model.learner_model_param->num_output_group;
this->num_group = model.learner_model_param->OutputLength();
}
};
@@ -792,13 +793,13 @@ class GPUPredictor : public xgboost::Predictor {
}
template <typename Adapter, typename Loader>
void DispatchedInplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
const gbm::GBTreeModel &model, float missing,
PredictionCacheEntry *out_preds,
uint32_t tree_begin, uint32_t tree_end) const {
void DispatchedInplacePredict(std::any const& x, std::shared_ptr<DMatrix> p_m,
const gbm::GBTreeModel& model, float missing,
PredictionCacheEntry* out_preds, uint32_t tree_begin,
uint32_t tree_end) const {
uint32_t const output_groups = model.learner_model_param->num_output_group;
auto m = dmlc::get<std::shared_ptr<Adapter>>(x);
auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
<< "Number of columns in data must equal to trained model.";
CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx())

View File

@@ -1,13 +1,12 @@
/*!
* Copyright 2021 by XGBoost Contributors
/**
* Copyright 2021-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_PREDICTOR_PREDICT_FN_H_
#define XGBOOST_PREDICTOR_PREDICT_FN_H_
#include "../common/categorical.h"
#include "xgboost/tree_model.h"
namespace xgboost {
namespace predictor {
namespace xgboost::predictor {
template <bool has_missing, bool has_categorical>
inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bst_node_t nid,
float fvalue, bool is_missing,
@@ -24,6 +23,25 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs
}
}
}
} // namespace predictor
} // namespace xgboost
template <bool has_missing, bool has_categorical>
inline XGBOOST_DEVICE bst_node_t GetNextNodeMulti(MultiTargetTree const &tree,
bst_node_t const nidx, float fvalue,
bool is_missing,
RegTree::CategoricalSplitMatrix const &cats) {
if (has_missing && is_missing) {
return tree.DefaultChild(nidx);
} else {
if (has_categorical && common::IsCat(cats.split_type, nidx)) {
auto node_categories =
cats.categories.subspan(cats.node_ptr[nidx].beg, cats.node_ptr[nidx].size);
return common::Decision(node_categories, fvalue) ? tree.LeftChild(nidx)
: tree.RightChild(nidx);
} else {
return tree.LeftChild(nidx) + !(fvalue < tree.SplitCond(nidx));
}
}
}
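// A note on the numeric branch above (explanatory, not part of the patch):
// children are stored contiguously with RightChild == LeftChild + 1, so
// `LeftChild(nidx) + !(fvalue < SplitCond(nidx))` picks the left child when
// fvalue is below the split condition and the right child otherwise, without
// branching. E.g. with LeftChild = 5 and fvalue >= cond, it yields 5 + 1 = 6.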
} // namespace xgboost::predictor
#endif // XGBOOST_PREDICTOR_PREDICT_FN_H_

View File

@@ -1,22 +1,26 @@
/*!
* Copyright 2021-2022 XGBoost contributors
/**
* Copyright 2021-2023 XGBoost contributors
* \file common_row_partitioner.h
* \brief Common partitioner logic for hist and approx methods.
*/
#ifndef XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_
#define XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_
#include <algorithm> // std::all_of
#include <cinttypes> // std::uint32_t
#include <limits> // std::numeric_limits
#include <vector>
#include "../collective/communicator-inl.h"
#include "../common/linalg_op.h" // cbegin
#include "../common/numeric.h" // Iota
#include "../common/partition_builder.h"
#include "hist/expand_entry.h" // CPUExpandEntry
#include "xgboost/base.h"
#include "xgboost/context.h" // Context
#include "xgboost/linalg.h" // TensorView
namespace xgboost {
namespace tree {
namespace xgboost::tree {
static constexpr size_t kPartitionBlockSize = 2048;
@@ -34,9 +38,10 @@ class ColumnSplitHelper {
missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
}
template <typename ExpandEntry>
void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
// When data is split by column, we don't have all the feature values in the local worker, so
// we first collect all the decisions and whether the feature is missing into bit vectors.
std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
@@ -97,41 +102,47 @@ class CommonRowPartitioner {
}
}
void FindSplitConditions(const std::vector<CPUExpandEntry>& nodes, const RegTree& tree,
template <typename ExpandEntry>
void FindSplitConditions(const std::vector<ExpandEntry>& nodes, const RegTree& tree,
const GHistIndexMatrix& gmat, std::vector<int32_t>* split_conditions) {
for (size_t i = 0; i < nodes.size(); ++i) {
const int32_t nid = nodes[i].nid;
const bst_uint fid = tree[nid].SplitIndex();
const bst_float split_pt = tree[nid].SplitCond();
const uint32_t lower_bound = gmat.cut.Ptrs()[fid];
const uint32_t upper_bound = gmat.cut.Ptrs()[fid + 1];
auto const& ptrs = gmat.cut.Ptrs();
auto const& vals = gmat.cut.Values();
for (std::size_t i = 0; i < nodes.size(); ++i) {
bst_node_t const nidx = nodes[i].nid;
bst_feature_t const fidx = tree.SplitIndex(nidx);
float const split_pt = tree.SplitCond(nidx);
std::uint32_t const lower_bound = ptrs[fidx];
std::uint32_t const upper_bound = ptrs[fidx + 1];
bst_bin_t split_cond = -1;
// convert floating-point split_pt into corresponding bin_id
// split_cond = -1 indicates that split_pt is less than all known cut points
CHECK_LT(upper_bound, static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
for (auto bound = lower_bound; bound < upper_bound; ++bound) {
if (split_pt == gmat.cut.Values()[bound]) {
split_cond = static_cast<int32_t>(bound);
if (split_pt == vals[bound]) {
split_cond = static_cast<bst_bin_t>(bound);
}
}
(*split_conditions).at(i) = split_cond;
(*split_conditions)[i] = split_cond;
}
}
void AddSplitsToRowSet(const std::vector<CPUExpandEntry>& nodes, RegTree const* p_tree) {
template <typename ExpandEntry>
void AddSplitsToRowSet(const std::vector<ExpandEntry>& nodes, RegTree const* p_tree) {
const size_t n_nodes = nodes.size();
for (unsigned int i = 0; i < n_nodes; ++i) {
const int32_t nid = nodes[i].nid;
const int32_t nidx = nodes[i].nid;
const size_t n_left = partition_builder_.GetNLeftElems(i);
const size_t n_right = partition_builder_.GetNRightElems(i);
CHECK_EQ((*p_tree)[nid].LeftChild() + 1, (*p_tree)[nid].RightChild());
row_set_collection_.AddSplit(nid, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild(),
n_left, n_right);
CHECK_EQ(p_tree->LeftChild(nidx) + 1, p_tree->RightChild(nidx));
row_set_collection_.AddSplit(nidx, p_tree->LeftChild(nidx), p_tree->RightChild(nidx), n_left,
n_right);
}
}
template <typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
auto const& column_matrix = gmat.Transpose();
if (column_matrix.IsInitialized()) {
if (gmat.cut.HasCategorical()) {
@@ -149,10 +160,10 @@ class CommonRowPartitioner {
}
}
template <bool any_cat>
template <bool any_cat, typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
if (column_matrix.AnyMissing()) {
this->template UpdatePosition<true, any_cat>(ctx, gmat, column_matrix, nodes, p_tree);
} else {
@@ -160,33 +171,21 @@ class CommonRowPartitioner {
}
}
template <bool any_missing, bool any_cat>
template <bool any_missing, bool any_cat, typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
switch (column_matrix.GetTypeSize()) {
case common::kUint8BinsTypeSize:
this->template UpdatePosition<uint8_t, any_missing, any_cat>(ctx, gmat, column_matrix,
nodes, p_tree);
break;
case common::kUint16BinsTypeSize:
this->template UpdatePosition<uint16_t, any_missing, any_cat>(ctx, gmat, column_matrix,
nodes, p_tree);
break;
case common::kUint32BinsTypeSize:
this->template UpdatePosition<uint32_t, any_missing, any_cat>(ctx, gmat, column_matrix,
nodes, p_tree);
break;
default:
// no default behavior
CHECK(false) << column_matrix.GetTypeSize();
}
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto t) {
using T = decltype(t);
this->template UpdatePosition<T, any_missing, any_cat>(ctx, gmat, column_matrix, nodes,
p_tree);
});
}
template <typename BinIdxType, bool any_missing, bool any_cat>
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
// 1. Find split condition for each split
size_t n_nodes = nodes.size();
@@ -248,9 +247,9 @@ class CommonRowPartitioner {
AddSplitsToRowSet(nodes, p_tree);
}
auto const& Partitions() const { return row_set_collection_; }
[[nodiscard]] auto const& Partitions() const { return row_set_collection_; }
size_t Size() const {
[[nodiscard]] std::size_t Size() const {
return std::distance(row_set_collection_.begin(), row_set_collection_.end());
}
@@ -263,12 +262,29 @@ class CommonRowPartitioner {
[&](size_t idx) -> bool { return hess[idx] - .0f == .0f; });
}
void LeafPartition(Context const* ctx, RegTree const& tree,
linalg::TensorView<GradientPair const, 2> gpair,
std::vector<bst_node_t>* p_out_position) const {
if (gpair.Shape(1) > 1) {
partition_builder_.LeafPartition(
ctx, tree, this->Partitions(), p_out_position, [&](std::size_t idx) -> bool {
auto sample = gpair.Slice(idx, linalg::All());
return std::all_of(linalg::cbegin(sample), linalg::cend(sample),
[](GradientPair const& g) { return g.GetHess() - .0f == .0f; });
});
} else {
auto s = gpair.Slice(linalg::All(), 0);
partition_builder_.LeafPartition(
ctx, tree, this->Partitions(), p_out_position,
[&](std::size_t idx) -> bool { return s(idx).GetHess() - .0f == .0f; });
}
}
void LeafPartition(Context const* ctx, RegTree const& tree,
common::Span<GradientPair const> gpair,
std::vector<bst_node_t>* p_out_position) const {
partition_builder_.LeafPartition(
ctx, tree, this->Partitions(), p_out_position,
[&](size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; });
[&](std::size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; });
}
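// Both LeafPartition overloads above treat a row as invalid when its hessian
// is exactly zero (e.g. dropped by sampling); `GetHess() - .0f == .0f` is an
// exact floating-point test for that. The multi-target overload skips a row
// only when the hessians of all targets are zero.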
private:
@@ -278,6 +294,5 @@ class CommonRowPartitioner {
ColumnSplitHelper column_split_helper_;
};
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree
#endif // XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_

View File

@@ -1,111 +1,111 @@
/*!
* Copyright 2021 by XGBoost Contributors
*/
#ifndef XGBOOST_TREE_DRIVER_H_
#define XGBOOST_TREE_DRIVER_H_
#include <xgboost/span.h>
#include <queue>
#include <vector>
#include "./param.h"
namespace xgboost {
namespace tree {
template <typename ExpandEntryT>
inline bool DepthWise(const ExpandEntryT& lhs, const ExpandEntryT& rhs) {
return lhs.GetNodeId() > rhs.GetNodeId(); // favor small depth
}
template <typename ExpandEntryT>
inline bool LossGuide(const ExpandEntryT& lhs, const ExpandEntryT& rhs) {
if (lhs.GetLossChange() == rhs.GetLossChange()) {
return lhs.GetNodeId() > rhs.GetNodeId(); // favor small timestamp
} else {
return lhs.GetLossChange() < rhs.GetLossChange(); // favor large loss_chg
}
}
// Drives execution of tree building on device
template <typename ExpandEntryT>
class Driver {
using ExpandQueue =
std::priority_queue<ExpandEntryT, std::vector<ExpandEntryT>,
std::function<bool(ExpandEntryT, ExpandEntryT)>>;
public:
explicit Driver(TrainParam param, std::size_t max_node_batch_size = 256)
: param_(param),
max_node_batch_size_(max_node_batch_size),
queue_(param.grow_policy == TrainParam::kDepthWise ? DepthWise<ExpandEntryT>
: LossGuide<ExpandEntryT>) {}
template <typename EntryIterT>
void Push(EntryIterT begin, EntryIterT end) {
for (auto it = begin; it != end; ++it) {
const ExpandEntryT& e = *it;
if (e.split.loss_chg > kRtEps) {
queue_.push(e);
}
}
}
void Push(const std::vector<ExpandEntryT> &entries) {
this->Push(entries.begin(), entries.end());
}
void Push(ExpandEntryT const& e) { queue_.push(e); }
bool IsEmpty() {
return queue_.empty();
}
// Can a child of this entry still be expanded?
// can be used to avoid extra work
bool IsChildValid(ExpandEntryT const& parent_entry) {
if (param_.max_depth > 0 && parent_entry.depth + 1 >= param_.max_depth) return false;
if (param_.max_leaves > 0 && num_leaves_ >= param_.max_leaves) return false;
return true;
}
// Return the set of nodes to be expanded
// This set has no dependencies between entries so they may be expanded in
// parallel or asynchronously
std::vector<ExpandEntryT> Pop() {
if (queue_.empty()) return {};
// Return a single entry for loss guided mode
if (param_.grow_policy == TrainParam::kLossGuide) {
ExpandEntryT e = queue_.top();
queue_.pop();
if (e.IsValid(param_, num_leaves_)) {
num_leaves_++;
return {e};
} else {
return {};
}
}
// Return nodes on same level for depth wise
std::vector<ExpandEntryT> result;
ExpandEntryT e = queue_.top();
int level = e.depth;
while (e.depth == level && !queue_.empty() && result.size() < max_node_batch_size_) {
queue_.pop();
if (e.IsValid(param_, num_leaves_)) {
num_leaves_++;
result.emplace_back(e);
}
if (!queue_.empty()) {
e = queue_.top();
}
}
return result;
}
private:
TrainParam param_;
bst_node_t num_leaves_ = 1;
std::size_t max_node_batch_size_;
ExpandQueue queue_;
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_DRIVER_H_
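// A minimal driving loop (a sketch under assumptions, not code from this
// patch): `MakeRootEntry`, `ApplySplit`, and `EvaluateSplits` are
// hypothetical helpers; only Push/Pop/IsChildValid belong to Driver itself.
//
//   Driver<CPUExpandEntry> driver{param};
//   driver.Push(MakeRootEntry(tree));        // seed with the root split
//   while (!driver.IsEmpty()) {
//     auto batch = driver.Pop();             // one level, or one best node
//     for (auto const& e : batch) {
//       ApplySplit(e, &tree);                // hypothetical
//       if (driver.IsChildValid(e)) {
//         driver.Push(EvaluateSplits(e, &tree));  // hypothetical
//       }
//     }
//   }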

View File

@@ -21,7 +21,8 @@
namespace xgboost {
namespace tree {
namespace cpu_impl {
void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair,
linalg::VectorView<float> out) {
auto n_targets = out.Size();
CHECK_EQ(n_targets, gpair.Shape(1));
@@ -43,8 +44,12 @@ void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpai
}
}
CHECK(h_sum.CContiguous());
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
// In vertical federated learning, only worker 0 needs to call this; no allreduce is required.
if (!collective::IsFederated() || info.data_split_mode != DataSplitMode::kCol) {
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
}
for (std::size_t i = 0; i < h_sum.Size(); ++i) {
out(i) = static_cast<float>(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess()));
@@ -64,7 +69,7 @@ inline void FitStump(Context const*, linalg::TensorView<GradientPair const, 2>,
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
} // namespace cuda_impl
void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair,
void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out) {
out->SetDevice(ctx->gpu_id);
out->Reshape(n_targets);
@@ -72,7 +77,7 @@ void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair,
gpair.SetDevice(ctx->gpu_id);
auto gpair_t = linalg::MakeTensorView(ctx, &gpair, n_samples, n_targets);
ctx->IsCPU() ? cpu_impl::FitStump(ctx, gpair_t, out->HostView())
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
: cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id));
}
} // namespace tree
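// Note on the computation above (a sketch): with G_t and H_t the all-reduced
// per-target gradient/Hessian sums held in h_sum, the stump weight is
// presumably the lambda-free Newton step suggested by the helper's name:
//   w_t = CalcUnregularizedWeight(G_t, H_t) = -G_t / H_t
// i.e. one Newton step per target, used as the base_score estimate.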

View File

@@ -16,6 +16,7 @@
#include "../common/common.h" // AssertGPUSupport
#include "xgboost/base.h" // GradientPair
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // TensorView
@@ -30,7 +31,7 @@ XGBOOST_DEVICE inline double CalcUnregularizedWeight(T sum_grad, T sum_hess) {
/**
* @brief Fit a tree stump as an estimation of base_score.
*/
void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair,
void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out);
} // namespace tree
} // namespace xgboost

View File

@@ -4,22 +4,25 @@
#ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#include <algorithm>
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <numeric>
#include <utility>
#include <vector>
#include <algorithm> // for copy
#include <cstddef> // for size_t
#include <limits> // for numeric_limits
#include <memory> // for shared_ptr
#include <numeric> // for accumulate
#include <utility> // for move
#include <vector> // for vector
#include "../../common/categorical.h"
#include "../../common/hist_util.h"
#include "../../common/random.h"
#include "../../data/gradient_index.h"
#include "../constraints.h"
#include "../param.h" // for TrainParam
#include "../split_evaluator.h"
#include "xgboost/context.h"
#include "../../common/categorical.h" // for CatBitField
#include "../../common/hist_util.h" // for GHistRow, HistogramCuts
#include "../../common/linalg_op.h" // for cbegin, cend, begin
#include "../../common/random.h" // for ColumnSampler
#include "../constraints.h" // for FeatureInteractionConstraintHost
#include "../param.h" // for TrainParam
#include "../split_evaluator.h" // for TreeEvaluator
#include "expand_entry.h" // for MultiExpandEntry
#include "xgboost/base.h" // for bst_node_t, bst_target_t, bst_feature_t
#include "xgboost/context.h" // for COntext
#include "xgboost/linalg.h" // for Constants, Vector
namespace xgboost::tree {
template <typename ExpandEntry>
@@ -410,8 +413,6 @@ class HistEvaluator {
tree[candidate.nid].SplitIndex(), left_weight,
right_weight);
auto max_node = std::max(left_child, tree[candidate.nid].RightChild());
max_node = std::max(candidate.nid, max_node);
snode_.resize(tree.GetNodes().size());
snode_.at(left_child).stats = candidate.split.left_sum;
snode_.at(left_child).root_gain =
@@ -456,6 +457,216 @@ class HistEvaluator {
}
};
class HistMultiEvaluator {
std::vector<double> gain_;
linalg::Matrix<GradientPairPrecise> stats_;
TrainParam const *param_;
FeatureInteractionConstraintHost interaction_constraints_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
Context const *ctx_;
private:
static double MultiCalcSplitGain(TrainParam const &param,
linalg::VectorView<GradientPairPrecise const> left_sum,
linalg::VectorView<GradientPairPrecise const> right_sum,
linalg::VectorView<float> left_weight,
linalg::VectorView<float> right_weight) {
CalcWeight(param, left_sum, left_weight);
CalcWeight(param, right_sum, right_weight);
auto left_gain = CalcGainGivenWeight(param, left_sum, left_weight);
auto right_gain = CalcGainGivenWeight(param, right_sum, right_weight);
return left_gain + right_gain;
}
template <bst_bin_t d_step>
bool EnumerateSplit(common::HistogramCuts const &cut, bst_feature_t fidx,
common::Span<common::GHistRow const> hist,
linalg::VectorView<GradientPairPrecise const> parent_sum, double parent_gain,
SplitEntryContainer<std::vector<GradientPairPrecise>> *p_best) const {
auto const &cut_ptr = cut.Ptrs();
auto const &cut_val = cut.Values();
auto const &min_val = cut.MinValues();
auto sum = linalg::Empty<GradientPairPrecise>(ctx_, 2, hist.size());
auto left_sum = sum.Slice(0, linalg::All());
auto right_sum = sum.Slice(1, linalg::All());
bst_bin_t ibegin, iend;
if (d_step > 0) {
ibegin = static_cast<bst_bin_t>(cut_ptr[fidx]);
iend = static_cast<bst_bin_t>(cut_ptr[fidx + 1]);
} else {
ibegin = static_cast<bst_bin_t>(cut_ptr[fidx + 1]) - 1;
iend = static_cast<bst_bin_t>(cut_ptr[fidx]) - 1;
}
const auto imin = static_cast<bst_bin_t>(cut_ptr[fidx]);
auto n_targets = hist.size();
auto weight = linalg::Empty<float>(ctx_, 2, n_targets);
auto left_weight = weight.Slice(0, linalg::All());
auto right_weight = weight.Slice(1, linalg::All());
for (bst_bin_t i = ibegin; i != iend; i += d_step) {
for (bst_target_t t = 0; t < n_targets; ++t) {
auto t_hist = hist[t];
auto t_p = parent_sum(t);
left_sum(t) += t_hist[i];
right_sum(t) = t_p - left_sum(t);
}
if (d_step > 0) {
auto split_pt = cut_val[i];
auto loss_chg =
MultiCalcSplitGain(*param_, right_sum, left_sum, right_weight, left_weight) -
parent_gain;
p_best->Update(loss_chg, fidx, split_pt, d_step == -1, false, left_sum, right_sum);
} else {
float split_pt;
if (i == imin) {
split_pt = min_val[fidx];
} else {
split_pt = cut_val[i - 1];
}
auto loss_chg =
MultiCalcSplitGain(*param_, right_sum, left_sum, left_weight, right_weight) -
parent_gain;
p_best->Update(loss_chg, fidx, split_pt, d_step == -1, false, right_sum, left_sum);
}
}
// Return true if there are missing values. Doesn't handle floating-point error well.
if (d_step == +1) {
return !std::equal(linalg::cbegin(left_sum), linalg::cend(left_sum),
linalg::cbegin(parent_sum));
}
return false;
}
public:
void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
auto &entries = *p_entries;
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(entries.size());
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
auto nidx = entries[nidx_in_set].nid;
features[nidx_in_set] = column_sampler_->GetFeatureSet(tree.GetDepth(nidx));
}
CHECK(!features.empty());
std::int32_t n_threads = ctx_->Threads();
std::size_t const grain_size = std::max<std::size_t>(1, features.front()->Size() / n_threads);
common::BlockedSpace2d space(
entries.size(), [&](std::size_t nidx_in_set) { return features[nidx_in_set]->Size(); },
grain_size);
std::vector<MultiExpandEntry> tloc_candidates(n_threads * entries.size());
for (std::size_t i = 0; i < entries.size(); ++i) {
for (std::int32_t j = 0; j < n_threads; ++j) {
tloc_candidates[i * n_threads + j] = entries[i];
}
}
common::ParallelFor2d(space, n_threads, [&](std::size_t nidx_in_set, common::Range1d r) {
auto tidx = omp_get_thread_num();
auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
auto best = &entry->split;
auto parent_sum = stats_.Slice(entry->nid, linalg::All());
std::vector<common::GHistRow> node_hist;
for (auto t_hist : hist) {
node_hist.push_back((*t_hist)[entry->nid]);
}
auto features_set = features[nidx_in_set]->ConstHostSpan();
for (auto fidx_in_set = r.begin(); fidx_in_set < r.end(); fidx_in_set++) {
auto fidx = features_set[fidx_in_set];
if (!interaction_constraints_.Query(entry->nid, fidx)) {
continue;
}
auto parent_gain = gain_[entry->nid];
bool missing =
this->EnumerateSplit<+1>(cut, fidx, node_hist, parent_sum, parent_gain, best);
if (missing) {
this->EnumerateSplit<-1>(cut, fidx, node_hist, parent_sum, parent_gain, best);
}
}
});
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
for (auto tidx = 0; tidx < n_threads; ++tidx) {
entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
}
}
}
linalg::Vector<float> InitRoot(linalg::VectorView<GradientPairPrecise const> root_sum) {
auto n_targets = root_sum.Size();
stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets);
gain_.resize(1);
linalg::Vector<float> weight({n_targets}, ctx_->gpu_id);
CalcWeight(*param_, root_sum, weight.HostView());
auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView());
gain_.front() = root_gain;
auto h_stats = stats_.HostView();
std::copy(linalg::cbegin(root_sum), linalg::cend(root_sum), linalg::begin(h_stats));
return weight;
}
void ApplyTreeSplit(MultiExpandEntry const &candidate, RegTree *p_tree) {
auto n_targets = p_tree->NumTargets();
auto parent_sum = stats_.Slice(candidate.nid, linalg::All());
auto weight = linalg::Empty<float>(ctx_, 3, n_targets);
auto base_weight = weight.Slice(0, linalg::All());
CalcWeight(*param_, parent_sum, base_weight);
auto left_weight = weight.Slice(1, linalg::All());
auto left_sum =
linalg::MakeVec(candidate.split.left_sum.data(), candidate.split.left_sum.size());
CalcWeight(*param_, left_sum, param_->learning_rate, left_weight);
auto right_weight = weight.Slice(2, linalg::All());
auto right_sum =
linalg::MakeVec(candidate.split.right_sum.data(), candidate.split.right_sum.size());
CalcWeight(*param_, right_sum, param_->learning_rate, right_weight);
p_tree->ExpandNode(candidate.nid, candidate.split.SplitIndex(), candidate.split.split_value,
candidate.split.DefaultLeft(), base_weight, left_weight, right_weight);
CHECK(p_tree->IsMultiTarget());
auto left_child = p_tree->LeftChild(candidate.nid);
CHECK_GT(left_child, candidate.nid);
auto right_child = p_tree->RightChild(candidate.nid);
CHECK_GT(right_child, candidate.nid);
std::size_t n_nodes = p_tree->Size();
gain_.resize(n_nodes);
gain_[left_child] = CalcGainGivenWeight(*param_, left_sum, left_weight);
gain_[right_child] = CalcGainGivenWeight(*param_, right_sum, right_weight);
if (n_nodes >= stats_.Shape(0)) {
stats_.Reshape(n_nodes * 2, stats_.Shape(1));
}
CHECK_EQ(stats_.Shape(1), n_targets);
auto left_sum_stat = stats_.Slice(left_child, linalg::All());
std::copy(candidate.split.left_sum.cbegin(), candidate.split.left_sum.cend(),
linalg::begin(left_sum_stat));
auto right_sum_stat = stats_.Slice(right_child, linalg::All());
std::copy(candidate.split.right_sum.cbegin(), candidate.split.right_sum.cend(),
linalg::begin(right_sum_stat));
}
explicit HistMultiEvaluator(Context const *ctx, MetaInfo const &info, TrainParam const *param,
std::shared_ptr<common::ColumnSampler> sampler)
: param_{param}, column_sampler_{std::move(sampler)}, ctx_{ctx} {
interaction_constraints_.Configure(*param, info.num_col_);
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
param_->colsample_bynode, param_->colsample_bylevel,
param_->colsample_bytree);
}
};
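// Note on EnumerateSplit above: the forward pass (d_step = +1) accumulates
// bins left to right, so missing values implicitly fall into the right child
// (default_left = false). Its return value reports whether any gradient is
// unaccounted for (the accumulated left_sum never reaches parent_sum); only
// then is the backward pass run to also try sending missing values left:
//   bool missing = EnumerateSplit<+1>(...);    // missing -> right child
//   if (missing) { EnumerateSplit<-1>(...); }  // missing -> left child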
/**
* \brief CPU implementation of update prediction cache, which calculates the leaf value
* for the last tree and accumulates it to prediction vector.

View File

@@ -1,29 +1,51 @@
/*!
* Copyright 2021 XGBoost contributors
/**
* Copyright 2021-2023 XGBoost contributors
*/
#ifndef XGBOOST_TREE_HIST_EXPAND_ENTRY_H_
#define XGBOOST_TREE_HIST_EXPAND_ENTRY_H_
#include <utility>
#include "../param.h"
#include <algorithm> // for all_of
#include <ostream> // for ostream
#include <utility> // for move
#include <vector> // for vector
namespace xgboost {
namespace tree {
#include "../param.h" // for SplitEntry, SplitEntryContainer, TrainParam
#include "xgboost/base.h" // for GradientPairPrecise, bst_node_t
struct CPUExpandEntry {
int nid;
int depth;
SplitEntry split;
CPUExpandEntry() = default;
XGBOOST_DEVICE
CPUExpandEntry(int nid, int depth, SplitEntry split)
: nid(nid), depth(depth), split(std::move(split)) {}
CPUExpandEntry(int nid, int depth, float loss_chg)
: nid(nid), depth(depth) {
split.loss_chg = loss_chg;
namespace xgboost::tree {
/**
* \brief Structure for storing tree split candidate.
*/
template <typename Impl>
struct ExpandEntryImpl {
bst_node_t nid;
bst_node_t depth;
[[nodiscard]] float GetLossChange() const {
return static_cast<Impl const*>(this)->split.loss_chg;
}
[[nodiscard]] bst_node_t GetNodeId() const { return nid; }
static bool ChildIsValid(TrainParam const& param, bst_node_t depth, bst_node_t num_leaves) {
if (param.max_depth > 0 && depth >= param.max_depth) return false;
if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
return true;
}
bool IsValid(const TrainParam& param, int num_leaves) const {
[[nodiscard]] bool IsValid(TrainParam const& param, bst_node_t num_leaves) const {
return static_cast<Impl const*>(this)->IsValidImpl(param, num_leaves);
}
};
struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
SplitEntry split;
CPUExpandEntry() = default;
CPUExpandEntry(bst_node_t nidx, bst_node_t depth, SplitEntry split)
: ExpandEntryImpl{nidx, depth}, split(std::move(split)) {}
CPUExpandEntry(bst_node_t nidx, bst_node_t depth) : ExpandEntryImpl{nidx, depth} {}
[[nodiscard]] bool IsValidImpl(TrainParam const& param, bst_node_t num_leaves) const {
if (split.loss_chg <= kRtEps) return false;
if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
return false;
@@ -40,16 +62,7 @@ struct CPUExpandEntry {
return true;
}
float GetLossChange() const { return split.loss_chg; }
bst_node_t GetNodeId() const { return nid; }
static bool ChildIsValid(const TrainParam& param, int depth, int num_leaves) {
if (param.max_depth > 0 && depth >= param.max_depth) return false;
if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
return true;
}
friend std::ostream& operator<<(std::ostream& os, const CPUExpandEntry& e) {
friend std::ostream& operator<<(std::ostream& os, CPUExpandEntry const& e) {
os << "ExpandEntry:\n";
os << "nidx: " << e.nid << "\n";
os << "depth: " << e.depth << "\n";
@@ -58,6 +71,54 @@ struct CPUExpandEntry {
return os;
}
};
} // namespace tree
} // namespace xgboost
struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
SplitEntryContainer<std::vector<GradientPairPrecise>> split;
MultiExpandEntry() = default;
MultiExpandEntry(bst_node_t nidx, bst_node_t depth) : ExpandEntryImpl{nidx, depth} {}
[[nodiscard]] bool IsValidImpl(TrainParam const& param, bst_node_t num_leaves) const {
if (split.loss_chg <= kRtEps) return false;
auto is_zero = [](auto const& sum) {
return std::all_of(sum.cbegin(), sum.cend(),
[&](auto const& g) { return g.GetHess() - .0 == .0; });
};
if (is_zero(split.left_sum) || is_zero(split.right_sum)) {
return false;
}
if (split.loss_chg < param.min_split_loss) {
return false;
}
if (param.max_depth > 0 && depth == param.max_depth) {
return false;
}
if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
return false;
}
return true;
}
friend std::ostream& operator<<(std::ostream& os, MultiExpandEntry const& e) {
os << "ExpandEntry: \n";
os << "nidx: " << e.nid << "\n";
os << "depth: " << e.depth << "\n";
os << "loss: " << e.split.loss_chg << "\n";
os << "split cond:" << e.split.split_value << "\n";
os << "split ind:" << e.split.SplitIndex() << "\n";
os << "left_sum: [";
for (auto v : e.split.left_sum) {
os << v << ", ";
}
os << "]\n";
os << "right_sum: [";
for (auto v : e.split.right_sum) {
os << v << ", ";
}
os << "]\n";
return os;
}
};
} // namespace xgboost::tree
#endif // XGBOOST_TREE_HIST_EXPAND_ENTRY_H_
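// Design note: ExpandEntryImpl uses the CRTP, so Driver can call
// GetLossChange()/GetNodeId()/IsValid() on any entry type without virtual
// dispatch. A condensed sketch (MyEntry is a hypothetical stand-in):
//   struct MyEntry : public ExpandEntryImpl<MyEntry> {
//     SplitEntry split;  // read by the base via static_cast<MyEntry const*>
//     bool IsValidImpl(TrainParam const& param, bst_node_t num_leaves) const {
//       return split.loss_chg > param.min_split_loss;
//     }
//   };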

View File

@@ -306,9 +306,9 @@ class HistogramBuilder {
// Construct a work space for building histogram. Eventually we should move this
// function into histogram builder once hist tree method supports external memory.
template <typename Partitioner>
template <typename Partitioner, typename ExpandEntry = CPUExpandEntry>
common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
std::vector<CPUExpandEntry> const &nodes_to_build) {
std::vector<ExpandEntry> const &nodes_to_build) {
std::vector<size_t> partition_size(nodes_to_build.size(), 0);
for (auto const &partition : partitioners) {
size_t k = 0;

View File

@@ -14,10 +14,12 @@
#include <string>
#include <vector>
#include "xgboost/parameter.h"
#include "xgboost/data.h"
#include "../common/categorical.h"
#include "../common/linalg_op.h"
#include "../common/math.h"
#include "xgboost/data.h"
#include "xgboost/linalg.h"
#include "xgboost/parameter.h"
namespace xgboost {
namespace tree {
@@ -197,12 +199,11 @@ struct TrainParam : public XGBoostParameter<TrainParam> {
}
/*! \brief given the loss change, whether we need to invoke pruning */
bool NeedPrune(double loss_chg, int depth) const {
return loss_chg < this->min_split_loss ||
(this->max_depth != 0 && depth > this->max_depth);
[[nodiscard]] bool NeedPrune(double loss_chg, int depth) const {
return loss_chg < this->min_split_loss || (this->max_depth != 0 && depth > this->max_depth);
}
bst_node_t MaxNodes() const {
[[nodiscard]] bst_node_t MaxNodes() const {
if (this->max_depth == 0 && this->max_leaves == 0) {
LOG(FATAL) << "Max leaves and max depth cannot both be unconstrained.";
}
@@ -292,6 +293,34 @@ XGBOOST_DEVICE inline float CalcWeight(const TrainingParams &p, GpairT sum_grad)
return CalcWeight(p, sum_grad.GetGrad(), sum_grad.GetHess());
}
/**
* \brief multi-target weight, calculated with learning rate.
*/
inline void CalcWeight(TrainParam const &p, linalg::VectorView<GradientPairPrecise const> grad_sum,
float eta, linalg::VectorView<float> out_w) {
for (bst_target_t i = 0; i < out_w.Size(); ++i) {
out_w(i) = CalcWeight(p, grad_sum(i).GetGrad(), grad_sum(i).GetHess()) * eta;
}
}
/**
* \brief multi-target weight
*/
inline void CalcWeight(TrainParam const &p, linalg::VectorView<GradientPairPrecise const> grad_sum,
linalg::VectorView<float> out_w) {
return CalcWeight(p, grad_sum, 1.0f, out_w);
}
inline double CalcGainGivenWeight(TrainParam const &p,
linalg::VectorView<GradientPairPrecise const> sum_grad,
linalg::VectorView<float const> weight) {
double gain{0};
for (bst_target_t i = 0; i < weight.Size(); ++i) {
gain += -weight(i) * ThresholdL1(sum_grad(i).GetGrad(), p.reg_alpha);
}
return gain;
}
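// Worked form of the two helpers above (a sketch, assuming the scalar
// CalcWeight applies the usual elastic-net regularization and ignoring
// max_delta_step clamping): with T_a(g) = sign(g) * max(|g| - a, 0),
//   w_t  = -eta * T_alpha(G_t) / (H_t + lambda)
//   gain = sum_t -w_t * T_alpha(G_t)
// where G_t/H_t are the per-target gradient/Hessian sums, alpha = reg_alpha,
// lambda = reg_lambda, and eta the learning rate passed to CalcWeight.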
/*! \brief core statistics used for tree construction */
struct XGBOOST_ALIGNAS(16) GradStats {
using GradType = double;
@@ -301,8 +330,8 @@ struct XGBOOST_ALIGNAS(16) GradStats {
GradType sum_hess { 0 };
public:
XGBOOST_DEVICE GradType GetGrad() const { return sum_grad; }
XGBOOST_DEVICE GradType GetHess() const { return sum_hess; }
[[nodiscard]] XGBOOST_DEVICE GradType GetGrad() const { return sum_grad; }
[[nodiscard]] XGBOOST_DEVICE GradType GetHess() const { return sum_hess; }
friend std::ostream& operator<<(std::ostream& os, GradStats s) {
os << s.GetGrad() << "/" << s.GetHess();
@@ -340,7 +369,7 @@ struct XGBOOST_ALIGNAS(16) GradStats {
sum_hess = a.sum_hess - b.sum_hess;
}
/*! \return whether the statistics is not used yet */
inline bool Empty() const { return sum_hess == 0.0; }
[[nodiscard]] bool Empty() const { return sum_hess == 0.0; }
/*! \brief add statistics to the data */
inline void Add(GradType grad, GradType hess) {
sum_grad += grad;
@@ -348,6 +377,19 @@ struct XGBOOST_ALIGNAS(16) GradStats {
}
};
// Helper functions for copying gradient statistic, one for vector leaf, another for normal scalar.
template <typename T, typename U>
std::vector<T> &CopyStats(linalg::VectorView<U> const &src, std::vector<T> *dst) { // NOLINT
dst->resize(src.Size());
std::copy(linalg::cbegin(src), linalg::cend(src), dst->begin());
return *dst;
}
inline GradStats &CopyStats(GradStats const &src, GradStats *dst) { // NOLINT
*dst = src;
return *dst;
}
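// Usage note (a sketch): the two CopyStats overloads let SplitEntryContainer
// carry either scalar or vector leaf statistics, e.g. (hypothetical values):
//   GradStats dst;
//   CopyStats(GradStats{/*grad=*/1.0, /*hess=*/2.0}, &dst);      // scalar copy
//   std::vector<GradientPairPrecise> v_dst;
//   CopyStats(linalg::MakeVec(src.data(), src.size()), &v_dst);  // vector copy
// This is what allows the templated Update() below to accept either sum type.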
/*!
* \brief statistics that is helpful to store
* and represent a split solution for the tree
@@ -378,9 +420,9 @@ struct SplitEntryContainer {
return os;
}
/*!\return feature index to split on */
bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); }
[[nodiscard]] bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); }
/*!\return whether missing value goes to left branch */
bool DefaultLeft() const { return (sindex >> 31) != 0; }
[[nodiscard]] bool DefaultLeft() const { return (sindex >> 31) != 0; }
/*!
* \brief decides whether we can replace current entry with the given statistics
*
@@ -391,10 +433,10 @@ struct SplitEntryContainer {
* \param new_loss_chg the loss reduction get through the split
* \param split_index the feature index where the split is on
*/
bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
[[nodiscard]] bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
if (std::isinf(new_loss_chg)) { // in some cases new_loss_chg can be NaN or Inf,
// for example when lambda = 0 & min_child_weight = 0
// skip value in this case
// for example when lambda = 0 & min_child_weight = 0
// skip value in this case
return false;
} else if (this->SplitIndex() <= split_index) {
return new_loss_chg > this->loss_chg;
@@ -429,9 +471,10 @@ struct SplitEntryContainer {
* \param default_left whether the missing value goes to left
* \return whether the proposed split is better and can replace current split
*/
bool Update(bst_float new_loss_chg, unsigned split_index,
bst_float new_split_value, bool default_left, bool is_cat,
const GradientT &left_sum, const GradientT &right_sum) {
template <typename GradientSumT>
bool Update(bst_float new_loss_chg, unsigned split_index, bst_float new_split_value,
bool default_left, bool is_cat, GradientSumT const &left_sum,
GradientSumT const &right_sum) {
if (this->NeedReplace(new_loss_chg, split_index)) {
this->loss_chg = new_loss_chg;
if (default_left) {
@@ -440,8 +483,8 @@ struct SplitEntryContainer {
this->sindex = split_index;
this->split_value = new_split_value;
this->is_cat = is_cat;
this->left_sum = left_sum;
this->right_sum = right_sum;
CopyStats(left_sum, &this->left_sum);
CopyStats(right_sum, &this->right_sum);
return true;
} else {
return false;

View File

@@ -815,9 +815,9 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split
linalg::VectorView<float const> left_weight,
linalg::VectorView<float const> right_weight) {
CHECK(IsMultiTarget());
CHECK_LT(split_index, this->param.num_feature);
CHECK_LT(split_index, this->param_.num_feature);
CHECK(this->p_mt_tree_);
CHECK_GT(param.size_leaf_vector, 1);
CHECK_GT(param_.size_leaf_vector, 1);
this->p_mt_tree_->Expand(nidx, split_index, split_cond, default_left, base_weight, left_weight,
right_weight);
@@ -826,7 +826,7 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split
split_categories_segments_.resize(this->Size());
this->split_types_.at(nidx) = FeatureType::kNumerical;
this->param.num_nodes = this->p_mt_tree_->Size();
this->param_.num_nodes = this->p_mt_tree_->Size();
}
void RegTree::ExpandCategorical(bst_node_t nid, bst_feature_t split_index,
@@ -850,13 +850,13 @@ void RegTree::ExpandCategorical(bst_node_t nid, bst_feature_t split_index,
}
void RegTree::Load(dmlc::Stream* fi) {
CHECK_EQ(fi->Read(&param, sizeof(TreeParam)), sizeof(TreeParam));
CHECK_EQ(fi->Read(&param_, sizeof(TreeParam)), sizeof(TreeParam));
if (!DMLC_IO_NO_ENDIAN_SWAP) {
param = param.ByteSwap();
param_ = param_.ByteSwap();
}
nodes_.resize(param.num_nodes);
stats_.resize(param.num_nodes);
CHECK_NE(param.num_nodes, 0);
nodes_.resize(param_.num_nodes);
stats_.resize(param_.num_nodes);
CHECK_NE(param_.num_nodes, 0);
CHECK_EQ(fi->Read(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()),
sizeof(Node) * nodes_.size());
if (!DMLC_IO_NO_ENDIAN_SWAP) {
@@ -873,29 +873,31 @@ void RegTree::Load(dmlc::Stream* fi) {
}
// chg deleted nodes
deleted_nodes_.resize(0);
for (int i = 1; i < param.num_nodes; ++i) {
for (int i = 1; i < param_.num_nodes; ++i) {
if (nodes_[i].IsDeleted()) {
deleted_nodes_.push_back(i);
}
}
CHECK_EQ(static_cast<int>(deleted_nodes_.size()), param.num_deleted);
CHECK_EQ(static_cast<int>(deleted_nodes_.size()), param_.num_deleted);
split_types_.resize(param.num_nodes, FeatureType::kNumerical);
split_categories_segments_.resize(param.num_nodes);
split_types_.resize(param_.num_nodes, FeatureType::kNumerical);
split_categories_segments_.resize(param_.num_nodes);
}
void RegTree::Save(dmlc::Stream* fo) const {
CHECK_EQ(param.num_nodes, static_cast<int>(nodes_.size()));
CHECK_EQ(param.num_nodes, static_cast<int>(stats_.size()));
CHECK_EQ(param.deprecated_num_roots, 1);
CHECK_NE(param.num_nodes, 0);
CHECK_EQ(param_.num_nodes, static_cast<int>(nodes_.size()));
CHECK_EQ(param_.num_nodes, static_cast<int>(stats_.size()));
CHECK_EQ(param_.deprecated_num_roots, 1);
CHECK_NE(param_.num_nodes, 0);
CHECK(!IsMultiTarget())
<< "Please use JSON/UBJSON for saving models with multi-target trees.";
CHECK(!HasCategoricalSplit())
<< "Please use JSON/UBJSON for saving models with categorical splits.";
if (DMLC_IO_NO_ENDIAN_SWAP) {
fo->Write(&param, sizeof(TreeParam));
fo->Write(&param_, sizeof(TreeParam));
} else {
TreeParam x = param.ByteSwap();
TreeParam x = param_.ByteSwap();
fo->Write(&x, sizeof(x));
}
@@ -1081,7 +1083,7 @@ void RegTree::LoadModel(Json const& in) {
bool typed = IsA<I32Array>(in[tf::kParent]);
auto const& in_obj = get<Object const>(in);
// basic properties
FromJson(in["tree_param"], &param);
FromJson(in["tree_param"], &param_);
// categorical splits
bool has_cat = in_obj.find("split_type") != in_obj.cend();
if (has_cat) {
@@ -1092,55 +1094,55 @@ void RegTree::LoadModel(Json const& in) {
}
}
// multi-target
if (param.size_leaf_vector > 1) {
this->p_mt_tree_.reset(new MultiTargetTree{&param});
if (param_.size_leaf_vector > 1) {
this->p_mt_tree_.reset(new MultiTargetTree{&param_});
this->GetMultiTargetTree()->LoadModel(in);
return;
}
bool feature_is_64 = IsA<I64Array>(in["split_indices"]);
if (typed && feature_is_64) {
LoadModelImpl<true, true>(in, param, &stats_, &nodes_);
LoadModelImpl<true, true>(in, param_, &stats_, &nodes_);
} else if (typed && !feature_is_64) {
LoadModelImpl<true, false>(in, param, &stats_, &nodes_);
LoadModelImpl<true, false>(in, param_, &stats_, &nodes_);
} else if (!typed && feature_is_64) {
LoadModelImpl<false, true>(in, param, &stats_, &nodes_);
LoadModelImpl<false, true>(in, param_, &stats_, &nodes_);
} else {
LoadModelImpl<false, false>(in, param, &stats_, &nodes_);
LoadModelImpl<false, false>(in, param_, &stats_, &nodes_);
}
if (!has_cat) {
this->split_categories_segments_.resize(this->param.num_nodes);
this->split_types_.resize(this->param.num_nodes);
this->split_categories_segments_.resize(this->param_.num_nodes);
this->split_types_.resize(this->param_.num_nodes);
std::fill(split_types_.begin(), split_types_.end(), FeatureType::kNumerical);
}
deleted_nodes_.clear();
for (bst_node_t i = 1; i < param.num_nodes; ++i) {
for (bst_node_t i = 1; i < param_.num_nodes; ++i) {
if (nodes_[i].IsDeleted()) {
deleted_nodes_.push_back(i);
}
}
// easier access to [] operator
auto& self = *this;
for (auto nid = 1; nid < param.num_nodes; ++nid) {
for (auto nid = 1; nid < param_.num_nodes; ++nid) {
auto parent = self[nid].Parent();
CHECK_NE(parent, RegTree::kInvalidNodeId);
self[nid].SetParent(self[nid].Parent(), self[parent].LeftChild() == nid);
}
CHECK_EQ(static_cast<bst_node_t>(deleted_nodes_.size()), param.num_deleted);
CHECK_EQ(this->split_categories_segments_.size(), param.num_nodes);
CHECK_EQ(static_cast<bst_node_t>(deleted_nodes_.size()), param_.num_deleted);
CHECK_EQ(this->split_categories_segments_.size(), param_.num_nodes);
}
void RegTree::SaveModel(Json* p_out) const {
auto& out = *p_out;
// basic properties
out["tree_param"] = ToJson(param);
out["tree_param"] = ToJson(param_);
// categorical splits
this->SaveCategoricalSplit(p_out);
// multi-target
if (this->IsMultiTarget()) {
CHECK_GT(param.size_leaf_vector, 1);
CHECK_GT(param_.size_leaf_vector, 1);
this->GetMultiTargetTree()->SaveModel(p_out);
return;
}
@@ -1150,11 +1152,11 @@ void RegTree::SaveModel(Json* p_out) const {
* pruner, and this pruner can be used inside another updater so leaf are not necessary
* at the end of node array.
*/
CHECK_EQ(param.num_nodes, static_cast<int>(nodes_.size()));
CHECK_EQ(param.num_nodes, static_cast<int>(stats_.size()));
CHECK_EQ(param_.num_nodes, static_cast<int>(nodes_.size()));
CHECK_EQ(param_.num_nodes, static_cast<int>(stats_.size()));
CHECK_EQ(get<String>(out["tree_param"]["num_nodes"]), std::to_string(param.num_nodes));
auto n_nodes = param.num_nodes;
CHECK_EQ(get<String>(out["tree_param"]["num_nodes"]), std::to_string(param_.num_nodes));
auto n_nodes = param_.num_nodes;
// stats
F32Array loss_changes(n_nodes);
@@ -1168,7 +1170,7 @@ void RegTree::SaveModel(Json* p_out) const {
F32Array conds(n_nodes);
U8Array default_left(n_nodes);
CHECK_EQ(this->split_types_.size(), param.num_nodes);
CHECK_EQ(this->split_types_.size(), param_.num_nodes);
namespace tf = tree_field;
@@ -1189,7 +1191,7 @@ void RegTree::SaveModel(Json* p_out) const {
default_left.Set(i, static_cast<uint8_t>(!!n.DefaultLeft()));
}
};
if (this->param.num_feature > static_cast<bst_feature_t>(std::numeric_limits<int32_t>::max())) {
if (this->param_.num_feature > static_cast<bst_feature_t>(std::numeric_limits<int32_t>::max())) {
I64Array indices_64(n_nodes);
save_tree(&indices_64);
out[tf::kSplitIdx] = std::move(indices_64);

View File

@@ -226,8 +226,8 @@ class GloablApproxBuilder {
for (auto const &candidate : valid_candidates) {
int left_child_nidx = tree[candidate.nid].LeftChild();
int right_child_nidx = tree[candidate.nid].RightChild();
CPUExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx), {}};
CPUExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx), {}};
CPUExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx)};
CPUExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx)};
best_splits.push_back(l_best);
best_splits.push_back(r_best);
}

View File

@@ -190,7 +190,7 @@ class ColMaker: public TreeUpdater {
(*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
}
// remember auxiliary statistics in the tree node
for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
for (int nid = 0; nid < p_tree->NumNodes(); ++nid) {
p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;
p_tree->Stat(nid).base_weight = snode_[nid].weight;
p_tree->Stat(nid).sum_hess = static_cast<float>(snode_[nid].stats.sum_hess);
@@ -255,9 +255,9 @@ class ColMaker: public TreeUpdater {
{
// setup statistics space for each tree node
for (auto& i : stemp_) {
i.resize(tree.param.num_nodes, ThreadEntry());
i.resize(tree.NumNodes(), ThreadEntry());
}
snode_.resize(tree.param.num_nodes, NodeEntry());
snode_.resize(tree.NumNodes(), NodeEntry());
}
const MetaInfo& info = fmat.Info();
// setup position

View File

@@ -72,7 +72,7 @@ class TreePruner : public TreeUpdater {
void DoPrune(TrainParam const* param, RegTree* p_tree) {
auto& tree = *p_tree;
bst_node_t npruned = 0;
for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
for (int nid = 0; nid < tree.NumNodes(); ++nid) {
if (tree[nid].IsLeaf() && !tree[nid].IsDeleted()) {
npruned = this->TryPruneLeaf(param, p_tree, nid, tree.GetDepth(nid), npruned);
}

View File

@@ -4,263 +4,368 @@
* \brief use quantized feature values to construct a tree
* \author Philip Cho, Tianqi Chen, Egor Smirnov
*/
#include "./updater_quantile_hist.h"
#include <algorithm> // for max, copy, transform
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t, int32_t
#include <memory> // for unique_ptr, allocator, make_unique, shared_ptr
#include <numeric> // for accumulate
#include <ostream> // for basic_ostream, char_traits, operator<<
#include <utility> // for move, swap
#include <vector> // for vector
#include <algorithm>
#include <cstddef>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
#include "../collective/communicator.h" // for Operation
#include "../common/hist_util.h" // for HistogramCuts, HistCollection
#include "../common/linalg_op.h" // for begin, cbegin, cend
#include "../common/random.h" // for ColumnSampler
#include "../common/threading_utils.h" // for ParallelFor
#include "../common/timer.h" // for Monitor
#include "../common/transform_iterator.h" // for IndexTransformIter, MakeIndexTransformIter
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "common_row_partitioner.h" // for CommonRowPartitioner
#include "dmlc/omp.h" // for omp_get_thread_num
#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG
#include "driver.h" // for Driver
#include "hist/evaluate_splits.h" // for HistEvaluator, HistMultiEvaluator, UpdatePre...
#include "hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
#include "hist/histogram.h" // for HistogramBuilder, ConstructHistSpace
#include "hist/sampler.h" // for SampleGradient
#include "param.h" // for TrainParam, SplitEntryContainer, GradStats
#include "xgboost/base.h" // for GradientPairInternal, GradientPair, bst_targ...
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for BatchIterator, BatchSet, DMatrix, MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for All, MatrixView, TensorView, Matrix, Empty
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_GE
#include "xgboost/span.h" // for Span, operator!=, SpanIterator
#include "xgboost/string_view.h" // for operator<<
#include "xgboost/task.h" // for ObjInfo
#include "xgboost/tree_model.h" // for RegTree, MTNotImplemented, RTreeNodeStat
#include "xgboost/tree_updater.h" // for TreeUpdater, TreeUpdaterReg, XGBOOST_REGISTE...
#include "common_row_partitioner.h"
#include "constraints.h"
#include "hist/evaluate_splits.h"
#include "hist/histogram.h"
#include "hist/sampler.h"
#include "param.h"
#include "xgboost/linalg.h"
#include "xgboost/logging.h"
#include "xgboost/tree_updater.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);
void QuantileHistMaker::Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair,
DMatrix *dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) {
// build tree
const size_t n_trees = trees.size();
if (!pimpl_) {
pimpl_.reset(new Builder(n_trees, param, dmat, *task_, ctx_));
}
BatchParam HistBatch(TrainParam const *param) { return {param->max_bin, param->sparse_threshold}; }
size_t t_idx{0};
for (auto p_tree : trees) {
auto &t_row_position = out_position[t_idx];
this->pimpl_->UpdateTree(gpair, dmat, p_tree, &t_row_position);
++t_idx;
}
}
bool QuantileHistMaker::UpdatePredictionCache(const DMatrix *data,
linalg::VectorView<float> out_preds) {
if (pimpl_) {
return pimpl_->UpdatePredictionCache(data, out_preds);
} else {
return false;
}
}
CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
DMatrix *p_fmat, RegTree *p_tree, const std::vector<GradientPair> &gpair_h) {
CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0), 0.0f);
size_t page_id = 0;
auto space = ConstructHistSpace(partitioner_, {node});
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
std::vector<CPUExpandEntry> nodes_to_build{node};
std::vector<CPUExpandEntry> nodes_to_sub;
this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
partitioner_.at(page_id).Partitions(), nodes_to_build,
nodes_to_sub, gpair_h);
++page_id;
}
{
GradientPairPrecise grad_stat;
if (p_fmat->IsDense()) {
/**
* Specialized code for dense data: For dense data (with no missing value), the sum
* of gradient histogram is equal to snode[nid]
*/
auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_)).begin());
std::vector<uint32_t> const &row_ptr = gmat.cut.Ptrs();
CHECK_GE(row_ptr.size(), 2);
uint32_t const ibegin = row_ptr[0];
uint32_t const iend = row_ptr[1];
auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot];
auto begin = hist.data();
for (uint32_t i = ibegin; i < iend; ++i) {
GradientPairPrecise const &et = begin[i];
grad_stat.Add(et.GetGrad(), et.GetHess());
}
} else {
for (auto const &grad : gpair_h) {
grad_stat.Add(grad.GetGrad(), grad.GetHess());
}
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat), 2);
}
auto weight = evaluator_->InitRoot(GradStats{grad_stat});
p_tree->Stat(RegTree::kRoot).sum_hess = grad_stat.GetHess();
p_tree->Stat(RegTree::kRoot).base_weight = weight;
(*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
std::vector<CPUExpandEntry> entries{node};
monitor_->Start("EvaluateSplits");
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree, &entries);
break;
}
monitor_->Stop("EvaluateSplits");
node = entries.front();
}
return node;
}
void QuantileHistMaker::Builder::BuildHistogram(DMatrix *p_fmat, RegTree *p_tree,
std::vector<CPUExpandEntry> const &valid_candidates,
std::vector<GradientPair> const &gpair) {
std::vector<CPUExpandEntry> nodes_to_build(valid_candidates.size());
std::vector<CPUExpandEntry> nodes_to_sub(valid_candidates.size());
size_t n_idx = 0;
for (auto const &c : valid_candidates) {
auto left_nidx = (*p_tree)[c.nid].LeftChild();
auto right_nidx = (*p_tree)[c.nid].RightChild();
auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
auto build_nidx = left_nidx;
auto subtract_nidx = right_nidx;
if (fewer_right) {
std::swap(build_nidx, subtract_nidx);
}
nodes_to_build[n_idx] = CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}};
nodes_to_sub[n_idx] = CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}};
n_idx++;
}
size_t page_id{0};
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
partitioner_.at(page_id).Partitions(), nodes_to_build,
nodes_to_sub, gpair);
++page_id;
}
}
void QuantileHistMaker::Builder::LeafPartition(RegTree const &tree,
common::Span<GradientPair const> gpair,
std::vector<bst_node_t> *p_out_position) {
template <typename ExpandEntry, typename Updater>
void UpdateTree(common::Monitor *monitor_, linalg::MatrixView<GradientPair const> gpair,
Updater *updater, DMatrix *p_fmat, TrainParam const *param,
HostDeviceVector<bst_node_t> *p_out_position, RegTree *p_tree) {
monitor_->Start(__func__);
if (!task_.UpdateTreeLeaf()) {
return;
}
for (auto const &part : partitioner_) {
part.LeafPartition(ctx_, tree, gpair, p_out_position);
}
monitor_->Stop(__func__);
}
updater->InitData(p_fmat, p_tree);
void QuantileHistMaker::Builder::ExpandTree(DMatrix *p_fmat, RegTree *p_tree,
const std::vector<GradientPair> &gpair_h,
HostDeviceVector<bst_node_t> *p_out_position) {
monitor_->Start(__func__);
Driver<CPUExpandEntry> driver(*param_);
driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h));
Driver<ExpandEntry> driver{*param};
auto const &tree = *p_tree;
driver.Push(updater->InitRoot(p_fmat, gpair, p_tree));
auto expand_set = driver.Pop();
/**
* Note for update position
* Root:
* Not applied: No need to update position as initialization already has all the rows ordered.
* Applied: Update position is run on applied nodes so the rows are partitioned.
* Non-root:
* Not applied: That node is root of the subtree, same rule as root.
* Applied: Ditto
*/
while (!expand_set.empty()) {
// candidates that can be further split.
std::vector<CPUExpandEntry> valid_candidates;
std::vector<ExpandEntry> valid_candidates;
// candidates that can be applied.
std::vector<CPUExpandEntry> applied;
int32_t depth = expand_set.front().depth + 1;
for (auto const& candidate : expand_set) {
evaluator_->ApplyTreeSplit(candidate, p_tree);
std::vector<ExpandEntry> applied;
for (auto const &candidate : expand_set) {
updater->ApplyTreeSplit(candidate, p_tree);
CHECK_GT(p_tree->LeftChild(candidate.nid), candidate.nid);
applied.push_back(candidate);
if (driver.IsChildValid(candidate)) {
valid_candidates.emplace_back(candidate);
}
}
monitor_->Start("UpdatePosition");
size_t page_id{0};
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
partitioner_.at(page_id).UpdatePosition(ctx_, page, applied, p_tree);
++page_id;
}
monitor_->Stop("UpdatePosition");
updater->UpdatePosition(p_fmat, p_tree, applied);
std::vector<CPUExpandEntry> best_splits;
std::vector<ExpandEntry> best_splits;
if (!valid_candidates.empty()) {
this->BuildHistogram(p_fmat, p_tree, valid_candidates, gpair_h);
updater->BuildHistogram(p_fmat, p_tree, valid_candidates, gpair);
for (auto const &candidate : valid_candidates) {
int left_child_nidx = tree[candidate.nid].LeftChild();
int right_child_nidx = tree[candidate.nid].RightChild();
CPUExpandEntry l_best{left_child_nidx, depth, 0.0};
CPUExpandEntry r_best{right_child_nidx, depth, 0.0};
auto left_child_nidx = tree.LeftChild(candidate.nid);
auto right_child_nidx = tree.RightChild(candidate.nid);
ExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx)};
ExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx)};
best_splits.push_back(l_best);
best_splits.push_back(r_best);
}
auto const &histograms = histogram_builder_->Histogram();
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, &best_splits);
break;
}
updater->EvaluateSplits(p_fmat, p_tree, &best_splits);
}
driver.Push(best_splits.begin(), best_splits.end());
expand_set = driver.Pop();
}
auto &h_out_position = p_out_position->HostVector();
this->LeafPartition(tree, gpair_h, &h_out_position);
updater->LeafPartition(tree, gpair, &h_out_position);
monitor_->Stop(__func__);
}
void QuantileHistMaker::Builder::UpdateTree(HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
RegTree *p_tree,
HostDeviceVector<bst_node_t> *p_out_position) {
monitor_->Start(__func__);
/**
* \brief Updater for building multi-target trees. The implementation simply iterates over
* each target.
*/
class MultiTargetHistBuilder {
private:
common::Monitor *monitor_{nullptr};
TrainParam const *param_{nullptr};
std::shared_ptr<common::ColumnSampler> col_sampler_;
std::unique_ptr<HistMultiEvaluator> evaluator_;
// Histogram builder for each target.
std::vector<HistogramBuilder<MultiExpandEntry>> histogram_builder_;
Context const *ctx_{nullptr};
// Partitioner for each data batch.
std::vector<CommonRowPartitioner> partitioner_;
// Pointer to last updated tree, used for update prediction cache.
RegTree const *p_last_tree_{nullptr};
std::vector<GradientPair> *gpair_ptr = &(gpair->HostVector());
// In case 'num_parallel_trees != 1', the initial gpair must not be modified, so work on a copy.
if (GetNumberOfTrees() != 1) {
gpair_local_.resize(gpair_ptr->size());
gpair_local_ = *gpair_ptr;
gpair_ptr = &gpair_local_;
ObjInfo const *task_{nullptr};
public:
void UpdatePosition(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<MultiExpandEntry> const &applied) {
monitor_->Start(__func__);
std::size_t page_id{0};
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(this->param_))) {
this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree);
page_id++;
}
monitor_->Stop(__func__);
}
this->InitData(p_fmat, *p_tree, gpair_ptr);
ExpandTree(p_fmat, p_tree, *gpair_ptr, p_out_position);
monitor_->Stop(__func__);
}
bool QuantileHistMaker::Builder::UpdatePredictionCache(DMatrix const *data,
linalg::VectorView<float> out_preds) const {
// p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
// conjunction with Update().
if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) {
return false;
void ApplyTreeSplit(MultiExpandEntry const &candidate, RegTree *p_tree) {
this->evaluator_->ApplyTreeSplit(candidate, p_tree);
}
monitor_->Start(__func__);
CHECK_EQ(out_preds.Size(), data->Info().num_row_);
UpdatePredictionCacheImpl(ctx_, p_last_tree_, partitioner_, out_preds);
monitor_->Stop(__func__);
return true;
}
size_t QuantileHistMaker::Builder::GetNumberOfTrees() { return n_trees_; }
void InitData(DMatrix *p_fmat, RegTree const *p_tree) {
monitor_->Start(__func__);
void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,
std::vector<GradientPair> *gpair) {
monitor_->Start(__func__);
const auto& info = fmat->Info();
std::size_t page_id = 0;
bst_bin_t n_total_bins = 0;
partitioner_.clear();
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
if (n_total_bins == 0) {
n_total_bins = page.cut.TotalBins();
} else {
CHECK_EQ(n_total_bins, page.cut.TotalBins());
}
partitioner_.emplace_back(ctx_, page.Size(), page.base_rowid, p_fmat->IsColumnSplit());
page_id++;
}
{
size_t page_id{0};
int32_t n_total_bins{0};
bst_target_t n_targets = p_tree->NumTargets();
histogram_builder_.clear();
for (std::size_t i = 0; i < n_targets; ++i) {
histogram_builder_.emplace_back();
histogram_builder_.back().Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed(), p_fmat->IsColumnSplit());
}
evaluator_ = std::make_unique<HistMultiEvaluator>(ctx_, p_fmat->Info(), param_, col_sampler_);
p_last_tree_ = p_tree;
monitor_->Stop(__func__);
}
MultiExpandEntry InitRoot(DMatrix *p_fmat, linalg::MatrixView<GradientPair const> gpair,
RegTree *p_tree) {
monitor_->Start(__func__);
MultiExpandEntry best;
best.nid = RegTree::kRoot;
best.depth = 0;
auto n_targets = p_tree->NumTargets();
linalg::Matrix<GradientPairPrecise> root_sum_tloc =
linalg::Empty<GradientPairPrecise>(ctx_, ctx_->Threads(), n_targets);
CHECK_EQ(root_sum_tloc.Shape(1), gpair.Shape(1));
auto h_root_sum_tloc = root_sum_tloc.HostView();
common::ParallelFor(gpair.Shape(0), ctx_->Threads(), [&](auto i) {
for (bst_target_t t{0}; t < n_targets; ++t) {
h_root_sum_tloc(omp_get_thread_num(), t) += GradientPairPrecise{gpair(i, t)};
}
});
// Aggregate to the first row.
auto root_sum = h_root_sum_tloc.Slice(0, linalg::All());
for (std::int32_t tidx{1}; tidx < ctx_->Threads(); ++tidx) {
for (bst_target_t t{0}; t < n_targets; ++t) {
root_sum(t) += h_root_sum_tloc(tidx, t);
}
}
CHECK(root_sum.CContiguous());
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2);
std::vector<MultiExpandEntry> nodes{best};
std::size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
for (bst_target_t t{0}; t < n_targets; ++t) {
auto t_gpair = gpair.Slice(linalg::All(), t);
histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes, {}, t_gpair.Values());
}
i++;
}
auto weight = evaluator_->InitRoot(root_sum);
auto weight_t = weight.HostView();
std::transform(linalg::cbegin(weight_t), linalg::cend(weight_t), linalg::begin(weight_t),
[&](float w) { return w * param_->learning_rate; });
p_tree->SetLeaf(RegTree::kRoot, weight_t);
std::vector<common::HistCollection const *> hists;
for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
hists.push_back(&histogram_builder_[t].Histogram());
}
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes);
break;
}
monitor_->Stop(__func__);
return nodes.front();
}
void BuildHistogram(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<MultiExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
monitor_->Start(__func__);
std::vector<MultiExpandEntry> nodes_to_build;
std::vector<MultiExpandEntry> nodes_to_sub;
for (auto const &c : valid_candidates) {
auto left_nidx = p_tree->LeftChild(c.nid);
auto right_nidx = p_tree->RightChild(c.nid);
auto build_nidx = left_nidx;
auto subtract_nidx = right_nidx;
auto lit =
common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
auto rit =
common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
auto fewer_right = right_sum < left_sum;
if (fewer_right) {
std::swap(build_nidx, subtract_nidx);
}
nodes_to_build.emplace_back(build_nidx, p_tree->GetDepth(build_nidx));
nodes_to_sub.emplace_back(subtract_nidx, p_tree->GetDepth(subtract_nidx));
}
std::size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) {
auto t_gpair = gpair.Slice(linalg::All(), t);
// Make sure the gradient matrix is f-order.
CHECK(t_gpair.Contiguous());
histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes_to_build, nodes_to_sub, t_gpair.Values());
}
i++;
}
monitor_->Stop(__func__);
}
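// Note on the fewer_right swap above: this is the histogram subtraction
// trick. Every parent bin satisfies parent[i] = left[i] + right[i], so only
// the child with the smaller Hessian sum (a proxy for fewer rows) is built
// by scanning data; the sibling is recovered as
//   sibling[i] = parent[i] - built[i]
// which is what the nodes_to_sub list requests from the histogram builder.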
void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<MultiExpandEntry> *best_splits) {
monitor_->Start(__func__);
std::vector<common::HistCollection const *> hists;
for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
hists.push_back(&histogram_builder_[t].Histogram());
}
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits);
break;
}
monitor_->Stop(__func__);
}
void LeafPartition(RegTree const &tree, linalg::MatrixView<GradientPair const> gpair,
std::vector<bst_node_t> *p_out_position) {
monitor_->Start(__func__);
if (!task_->UpdateTreeLeaf()) {
return;
}
for (auto const &part : partitioner_) {
part.LeafPartition(ctx_, tree, gpair, p_out_position);
}
monitor_->Stop(__func__);
}
public:
explicit MultiTargetHistBuilder(Context const *ctx, MetaInfo const &info, TrainParam const *param,
std::shared_ptr<common::ColumnSampler> column_sampler,
ObjInfo const *task, common::Monitor *monitor)
: monitor_{monitor},
param_{param},
col_sampler_{std::move(column_sampler)},
evaluator_{std::make_unique<HistMultiEvaluator>(ctx, info, param, col_sampler_)},
ctx_{ctx},
task_{task} {
monitor_->Init(__func__);
}
};
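// Note: MultiTargetHistBuilder keeps one HistogramBuilder per target but a
// single set of row partitioners, so each data pass builds every per-target
// histogram over the same row partition; HistMultiEvaluator then scans the
// per-target histograms jointly when searching for splits.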
class HistBuilder {
private:
common::Monitor *monitor_;
TrainParam const *param_;
std::shared_ptr<common::ColumnSampler> col_sampler_;
std::unique_ptr<HistEvaluator<CPUExpandEntry>> evaluator_;
std::vector<CommonRowPartitioner> partitioner_;
// back pointers to tree and data matrix
const RegTree *p_last_tree_{nullptr};
DMatrix const *const p_last_fmat_{nullptr};
std::unique_ptr<HistogramBuilder<CPUExpandEntry>> histogram_builder_;
ObjInfo const *task_{nullptr};
// Context for number of threads
Context const *ctx_{nullptr};
public:
explicit HistBuilder(Context const *ctx, std::shared_ptr<common::ColumnSampler> column_sampler,
TrainParam const *param, DMatrix const *fmat, ObjInfo const *task,
common::Monitor *monitor)
: monitor_{monitor},
param_{param},
col_sampler_{std::move(column_sampler)},
evaluator_{std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx, param, fmat->Info(),
col_sampler_)},
p_last_fmat_(fmat),
histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
task_{task},
ctx_{ctx} {
monitor_->Init(__func__);
}
bool UpdatePredictionCache(DMatrix const *data, linalg::VectorView<float> out_preds) const {
// p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
// conjunction with Update().
if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) {
return false;
}
monitor_->Start(__func__);
CHECK_EQ(out_preds.Size(), data->Info().num_row_);
UpdatePredictionCacheImpl(ctx_, p_last_tree_, partitioner_, out_preds);
monitor_->Stop(__func__);
return true;
}
public:
// initialize temp data structure
void InitData(DMatrix *fmat, RegTree const *p_tree) {
monitor_->Start(__func__);
std::size_t page_id{0};
bst_bin_t n_total_bins{0};
partitioner_.clear();
for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
if (n_total_bins == 0) {
@@ -273,22 +378,227 @@ void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,
}
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed(), fmat->IsColumnSplit());
auto m_gpair = linalg::MakeTensorView(ctx_, *gpair, gpair->size(), static_cast<std::size_t>(1));
SampleGradient(ctx_, *param_, m_gpair);
evaluator_ = std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx_, this->param_, fmat->Info(),
col_sampler_);
p_last_tree_ = p_tree;
}
// store a pointer to the tree
p_last_tree_ = &tree;
evaluator_.reset(new HistEvaluator<CPUExpandEntry>{ctx_, param_, info, column_sampler_});
void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<CPUExpandEntry> *best_splits) {
monitor_->Start(__func__);
auto const &histograms = histogram_builder_->Histogram();
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, best_splits);
break;
}
monitor_->Stop(__func__);
}
monitor_->Stop(__func__);
}
void ApplyTreeSplit(CPUExpandEntry const &candidate, RegTree *p_tree) {
this->evaluator_->ApplyTreeSplit(candidate, p_tree);
}
CPUExpandEntry InitRoot(DMatrix *p_fmat, linalg::MatrixView<GradientPair const> gpair,
RegTree *p_tree) {
CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0));
std::size_t page_id = 0;
auto space = ConstructHistSpace(partitioner_, {node});
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
std::vector<CPUExpandEntry> nodes_to_build{node};
std::vector<CPUExpandEntry> nodes_to_sub;
this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
partitioner_.at(page_id).Partitions(), nodes_to_build,
nodes_to_sub, gpair.Slice(linalg::All(), 0).Values());
++page_id;
}
{
GradientPairPrecise grad_stat;
if (p_fmat->IsDense()) {
/**
* Specialized code for dense data: For dense data (with no missing value), the sum
* of gradient histogram is equal to snode[nid]
*/
auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_)).begin());
std::vector<std::uint32_t> const &row_ptr = gmat.cut.Ptrs();
CHECK_GE(row_ptr.size(), 2);
std::uint32_t const ibegin = row_ptr[0];
std::uint32_t const iend = row_ptr[1];
auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot];
auto begin = hist.data();
for (std::uint32_t i = ibegin; i < iend; ++i) {
GradientPairPrecise const &et = begin[i];
grad_stat.Add(et.GetGrad(), et.GetHess());
}
} else {
auto gpair_h = gpair.Slice(linalg::All(), 0).Values();
for (auto const &grad : gpair_h) {
grad_stat.Add(grad.GetGrad(), grad.GetHess());
}
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat),
2);
}
auto weight = evaluator_->InitRoot(GradStats{grad_stat});
p_tree->Stat(RegTree::kRoot).sum_hess = grad_stat.GetHess();
p_tree->Stat(RegTree::kRoot).base_weight = weight;
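      // the root leaf value is the optimal weight scaled by the learning rate (shrinkage)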
(*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
std::vector<CPUExpandEntry> entries{node};
monitor_->Start("EvaluateSplits");
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree,
&entries);
break;
}
monitor_->Stop("EvaluateSplits");
node = entries.front();
}
return node;
}
void BuildHistogram(DMatrix *p_fmat, RegTree *p_tree,
std::vector<CPUExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
std::vector<CPUExpandEntry> nodes_to_build(valid_candidates.size());
std::vector<CPUExpandEntry> nodes_to_sub(valid_candidates.size());
std::size_t n_idx = 0;
for (auto const &c : valid_candidates) {
auto left_nidx = (*p_tree)[c.nid].LeftChild();
auto right_nidx = (*p_tree)[c.nid].RightChild();
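      // Histogram subtraction trick: build the histogram only for the child with the
      // smaller hessian sum, then derive the sibling as parent minus built child.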
auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
auto build_nidx = left_nidx;
auto subtract_nidx = right_nidx;
if (fewer_right) {
std::swap(build_nidx, subtract_nidx);
}
nodes_to_build[n_idx] = CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}};
nodes_to_sub[n_idx] = CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}};
n_idx++;
}
std::size_t page_id{0};
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
partitioner_.at(page_id).Partitions(), nodes_to_build,
nodes_to_sub, gpair.Values());
++page_id;
}
}
void UpdatePosition(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<CPUExpandEntry> const &applied) {
monitor_->Start(__func__);
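    // Each page of the quantized matrix keeps its own row partitioner; apply the
    // new splits page by page.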
std::size_t page_id{0};
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(this->param_))) {
this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree);
page_id++;
}
monitor_->Stop(__func__);
}
  void LeafPartition(RegTree const &tree, linalg::MatrixView<GradientPair const> gpair,
                     std::vector<bst_node_t> *p_out_position) {
    if (!task_->UpdateTreeLeaf()) {
      return;
    }
    monitor_->Start(__func__);
    for (auto const &part : partitioner_) {
      part.LeafPartition(ctx_, tree, gpair, p_out_position);
    }
    monitor_->Stop(__func__);
  }
};
/*! \brief construct a tree using quantized feature values */
class QuantileHistMaker : public TreeUpdater {
std::unique_ptr<HistBuilder> p_impl_{nullptr};
std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
std::shared_ptr<common::ColumnSampler> column_sampler_ =
std::make_shared<common::ColumnSampler>();
common::Monitor monitor_;
ObjInfo const *task_{nullptr};
public:
explicit QuantileHistMaker(Context const *ctx, ObjInfo const *task)
: TreeUpdater{ctx}, task_{task} {}
void Configure(const Args &) override {}
void LoadConfig(Json const &) override {}
void SaveConfig(Json *) const override {}
[[nodiscard]] char const *Name() const override { return "grow_quantile_histmaker"; }
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) override {
if (trees.front()->IsMultiTarget()) {
CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented();
if (!p_mtimpl_) {
this->p_mtimpl_ = std::make_unique<MultiTargetHistBuilder>(
ctx_, p_fmat->Info(), param, column_sampler_, task_, &monitor_);
}
} else {
if (!p_impl_) {
p_impl_ =
std::make_unique<HistBuilder>(ctx_, column_sampler_, param, p_fmat, task_, &monitor_);
}
}
bst_target_t n_targets = trees.front()->NumTargets();
auto h_gpair =
linalg::MakeTensorView(ctx_, gpair->HostSpan(), p_fmat->Info().num_row_, n_targets);
linalg::Matrix<GradientPair> sample_out;
auto h_sample_out = h_gpair;
auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; };
if (need_copy()) {
// allocate buffer
sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF};
h_sample_out = sample_out.HostView();
}
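    // Gradient sampling mutates its input, so when several trees (or multiple
    // targets) share the same gradient, each iteration works on a fresh copy.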
for (auto tree_it = trees.begin(); tree_it != trees.end(); ++tree_it) {
if (need_copy()) {
// Copy gradient into buffer for sampling. This converts C-order to F-order.
std::copy(linalg::cbegin(h_gpair), linalg::cend(h_gpair), linalg::begin(h_sample_out));
}
SampleGradient(ctx_, *param, h_sample_out);
auto *h_out_position = &out_position[tree_it - trees.begin()];
if ((*tree_it)->IsMultiTarget()) {
UpdateTree<MultiExpandEntry>(&monitor_, h_sample_out, p_mtimpl_.get(), p_fmat, param,
h_out_position, *tree_it);
} else {
UpdateTree<CPUExpandEntry>(&monitor_, h_sample_out, p_impl_.get(), p_fmat, param,
h_out_position, *tree_it);
}
}
}
bool UpdatePredictionCache(const DMatrix *data, linalg::VectorView<float> out_preds) override {
if (p_impl_) {
return p_impl_->UpdatePredictionCache(data, out_preds);
} else if (p_mtimpl_) {
// Not yet supported.
return false;
} else {
return false;
}
}
[[nodiscard]] bool HasNodePosition() const override { return true; }
};
XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker")
.describe("Grow tree using quantized histogram.")
.set_body([](Context const *ctx, ObjInfo const *task) {
      return new QuantileHistMaker{ctx, task};
});
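// A minimal usage sketch (not part of this diff; the exact registry helpers are
// assumed from xgboost/tree_updater.h): a registered updater is constructed by
// its registered name, e.g.
//
//   std::unique_ptr<TreeUpdater> updater{
//       TreeUpdater::Create("grow_quantile_histmaker", ctx, task)};
//   updater->Configure({});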
} // namespace xgboost::tree

View File

@@ -1,133 +0,0 @@
/*!
* Copyright 2017-2022 by XGBoost Contributors
* \file updater_quantile_hist.h
* \brief use quantized feature values to construct a tree
* \author Philip Cho, Tianqi Chen, Egor Smirnov
*/
#ifndef XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
#define XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
#include <xgboost/tree_updater.h>
#include <algorithm>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/json.h"
#include "hist/evaluate_splits.h"
#include "hist/histogram.h"
#include "hist/expand_entry.h"
#include "common_row_partitioner.h"
#include "constraints.h"
#include "./param.h"
#include "./driver.h"
#include "../common/random.h"
#include "../common/timer.h"
#include "../common/hist_util.h"
#include "../common/row_set.h"
#include "../common/partition_builder.h"
#include "../common/column_matrix.h"
namespace xgboost::tree {
inline BatchParam HistBatch(TrainParam const* param) {
return {param->max_bin, param->sparse_threshold};
}
/*! \brief construct a tree using quantized feature values */
class QuantileHistMaker: public TreeUpdater {
public:
explicit QuantileHistMaker(Context const* ctx, ObjInfo const* task)
: TreeUpdater(ctx), task_{task} {}
void Configure(const Args&) override {}
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& trees) override;
bool UpdatePredictionCache(const DMatrix *data,
linalg::VectorView<float> out_preds) override;
void LoadConfig(Json const&) override {}
void SaveConfig(Json*) const override {}
[[nodiscard]] char const* Name() const override { return "grow_quantile_histmaker"; }
[[nodiscard]] bool HasNodePosition() const override { return true; }
protected:
// actual builder that runs the algorithm
struct Builder {
public:
// constructor
explicit Builder(const size_t n_trees, TrainParam const* param, DMatrix const* fmat,
ObjInfo task, Context const* ctx)
: n_trees_(n_trees),
param_(param),
p_last_fmat_(fmat),
histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
task_{task},
ctx_{ctx},
monitor_{std::make_unique<common::Monitor>()} {
monitor_->Init("Quantile::Builder");
}
// update one tree, growing
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
HostDeviceVector<bst_node_t>* p_out_position);
bool UpdatePredictionCache(DMatrix const* data, linalg::VectorView<float> out_preds) const;
private:
// initialize temp data structure
void InitData(DMatrix* fmat, const RegTree& tree, std::vector<GradientPair>* gpair);
size_t GetNumberOfTrees();
CPUExpandEntry InitRoot(DMatrix* p_fmat, RegTree* p_tree,
const std::vector<GradientPair>& gpair_h);
void BuildHistogram(DMatrix* p_fmat, RegTree* p_tree,
std::vector<CPUExpandEntry> const& valid_candidates,
std::vector<GradientPair> const& gpair);
void LeafPartition(RegTree const& tree, common::Span<GradientPair const> gpair,
std::vector<bst_node_t>* p_out_position);
void ExpandTree(DMatrix* p_fmat, RegTree* p_tree, const std::vector<GradientPair>& gpair_h,
HostDeviceVector<bst_node_t>* p_out_position);
private:
const size_t n_trees_;
TrainParam const* param_;
std::shared_ptr<common::ColumnSampler> column_sampler_{
std::make_shared<common::ColumnSampler>()};
std::vector<GradientPair> gpair_local_;
std::unique_ptr<HistEvaluator<CPUExpandEntry>> evaluator_;
std::vector<CommonRowPartitioner> partitioner_;
// back pointers to tree and data matrix
const RegTree* p_last_tree_{nullptr};
DMatrix const* const p_last_fmat_;
std::unique_ptr<HistogramBuilder<CPUExpandEntry>> histogram_builder_;
ObjInfo task_;
// Context for number of threads
Context const* ctx_;
std::unique_ptr<common::Monitor> monitor_;
};
protected:
std::unique_ptr<Builder> pimpl_;
ObjInfo const* task_;
};
} // namespace xgboost::tree
#endif // XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_

View File

@@ -50,11 +50,11 @@ class TreeRefresher : public TreeUpdater {
int tid = omp_get_thread_num();
int num_nodes = 0;
for (auto tree : trees) {
num_nodes += tree->param.num_nodes;
num_nodes += tree->NumNodes();
}
stemp[tid].resize(num_nodes, GradStats());
std::fill(stemp[tid].begin(), stemp[tid].end(), GradStats());
fvec_temp[tid].Init(trees[0]->param.num_feature);
fvec_temp[tid].Init(trees[0]->NumFeatures());
});
}
exc.Rethrow();
@@ -77,7 +77,7 @@ class TreeRefresher : public TreeUpdater {
for (auto tree : trees) {
AddStats(*tree, feats, gpair_h, info, ridx,
dmlc::BeginPtr(stemp[tid]) + offset);
offset += tree->param.num_nodes;
offset += tree->NumNodes();
}
feats.Drop(inst);
});
@@ -96,7 +96,7 @@ class TreeRefresher : public TreeUpdater {
int offset = 0;
for (auto tree : trees) {
this->Refresh(param, dmlc::BeginPtr(stemp[0]) + offset, 0, tree);
offset += tree->param.num_nodes;
offset += tree->NumNodes();
}
}