Merge branch 'master' into dev-hui

This commit is contained in:
amdsc21
2023-03-08 00:39:33 +01:00
221 changed files with 3122 additions and 1486 deletions

View File

@@ -455,7 +455,8 @@ XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char
xgboost_CHECK_C_ARG_PTR(indptr);
xgboost_CHECK_C_ARG_PTR(indices);
xgboost_CHECK_C_ARG_PTR(data);
data::CSCArrayAdapter adapter{StringView{indptr}, StringView{indices}, StringView{data}, nrow};
data::CSCArrayAdapter adapter{StringView{indptr}, StringView{indices}, StringView{data},
static_cast<std::size_t>(nrow)};
xgboost_CHECK_C_ARG_PTR(c_json_config);
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2022 XGBoost contributors
/**
* Copyright 2022-2023 by XGBoost contributors
*/
#pragma once
#include <string>
@@ -9,7 +9,7 @@
namespace xgboost {
namespace collective {
/*!
/**
* \brief Initialize the collective communicator.
*
* Currently the communicator API is experimental, function signatures may change in the future
@@ -140,6 +140,19 @@ inline void Broadcast(std::string *sendrecv_data, int root) {
}
}
/**
 * @brief Gathers data from all processes and distributes it to all processes.
 *
 * This assumes all ranks have the same size, and input data has been sliced into the
 * corresponding position.
 *
 * @param send_receive_buffer Buffer storing the data; used in-place as both this rank's
 *                            input slice and the destination for the gathered result.
 * @param size Size of the data in bytes.
 */
inline void Allgather(void *send_receive_buffer, std::size_t size) {
  // Thin forwarding wrapper over the process-global communicator singleton.
  Communicator::Get()->AllGather(send_receive_buffer, size);
}
/*!
* \brief Perform in-place allreduce. This function is NOT thread-safe.
*
@@ -197,7 +210,7 @@ inline void Allreduce(uint64_t *send_receive_buffer, size_t count) {
template <Operation op, typename T,
typename = std::enable_if_t<std::is_same<size_t, T>{} && !std::is_same<uint64_t, T>{}> >
inline void Allreduce(T *send_receive_buffer, size_t count) {
static_assert(sizeof(T) == sizeof(uint64_t), "");
static_assert(sizeof(T) == sizeof(uint64_t));
Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kUInt64, op);
}

View File

@@ -1,10 +1,32 @@
/*!
* Copyright 2022 by XGBoost Contributors
/**
* Copyright 2022-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_ALGORITHM_H_
#define XGBOOST_COMMON_ALGORITHM_H_
#include <algorithm> // std::upper_bound
#include <cinttypes> // std::size_t
#include <algorithm> // upper_bound, stable_sort, sort, max
#include <cinttypes> // size_t
#include <functional> // less
#include <iterator> // iterator_traits, distance
#include <vector> // vector
#include "numeric.h" // Iota
#include "xgboost/context.h" // Context
// clang with libstdc++ works as well
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
!defined(__APPLE__) && __has_include(<omp.h>)
#define GCC_HAS_PARALLEL 1
#endif // GLIC_VERSION
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#define MSVC_HAS_PARALLEL 1
#endif // MSC
#if defined(GCC_HAS_PARALLEL)
#include <parallel/algorithm>
#elif defined(MSVC_HAS_PARALLEL)
#include <ppl.h>
#endif // GLIBC VERSION
namespace xgboost {
namespace common {
@@ -13,6 +35,63 @@ auto SegmentId(It first, It last, Idx idx) {
std::size_t segment_id = std::upper_bound(first, last, idx) - 1 - first;
return segment_id;
}
/**
 * @brief Stable sort of [begin, end) with `comp`, parallelized when possible.
 *
 * When more than one thread is configured and libstdc++ parallel mode is available
 * (GCC_HAS_PARALLEL), dispatches to __gnu_parallel::stable_sort with the context's thread
 * count; otherwise falls back to sequential std::stable_sort.  MSVC's PPL is deliberately
 * not used here (see comment below).
 */
template <typename Iter, typename Comp>
void StableSort(Context const *ctx, Iter begin, Iter end, Comp &&comp) {
  if (ctx->Threads() > 1) {
#if defined(GCC_HAS_PARALLEL)
    __gnu_parallel::stable_sort(begin, end, comp,
                                __gnu_parallel::default_parallel_tag(ctx->Threads()));
#else
    // the only stable sort is radix sort for msvc ppl.
    std::stable_sort(begin, end, comp);
#endif  // GCC_HAS_PARALLEL
  } else {
    std::stable_sort(begin, end, comp);
  }
}
/**
 * @brief Sort [begin, end) with `comp`, parallelized when possible.
 *
 * Dispatches to __gnu_parallel::sort (GCC parallel mode) or concurrency::parallel_sort
 * (MSVC PPL) when more than one thread is configured; otherwise — or when neither backend
 * is available — uses sequential std::sort.
 */
template <typename Iter, typename Comp>
void Sort(Context const *ctx, Iter begin, Iter end, Comp comp) {
  if (ctx->Threads() > 1) {
#if defined(GCC_HAS_PARALLEL)
    __gnu_parallel::sort(begin, end, comp, __gnu_parallel::default_parallel_tag(ctx->Threads()));
#elif defined(MSVC_HAS_PARALLEL)
    auto n = std::distance(begin, end);
    // use chunk size as hint to number of threads. No local policy/scheduler input with the
    // concurrency module.
    std::size_t chunk_size = n / ctx->Threads();
    // 2048 is the default of msvc ppl as of v2022.
    chunk_size = std::max(chunk_size, static_cast<std::size_t>(2048));
    concurrency::parallel_sort(begin, end, comp, chunk_size);
#else
    std::sort(begin, end, comp);
#endif  // GCC_HAS_PARALLEL / MSVC_HAS_PARALLEL
  } else {
    std::sort(begin, end, comp);
  }
}
/**
 * @brief Stable argsort: returns the indices that would sort [begin, end) by `comp`.
 *
 * CPU-only (enforced by the CHECK on ctx->IsCPU()).  Ties keep their original relative
 * order since the permutation is sorted with StableSort.
 *
 * @tparam Idx  Integer type of the returned indices.
 * @return Vector of distance(begin, end) indices forming the sorting permutation.
 */
template <typename Idx, typename Iter, typename V = typename std::iterator_traits<Iter>::value_type,
          typename Comp = std::less<V>>
std::vector<Idx> ArgSort(Context const *ctx, Iter begin, Iter end, Comp comp = std::less<V>{}) {
  CHECK(ctx->IsCPU());
  auto n = std::distance(begin, end);
  std::vector<Idx> result(n);
  // Fill result with 0..n-1 (project helper from numeric.h).
  Iota(ctx, result.begin(), result.end(), 0);
  // Compare indices by the values they refer to in the input range.
  auto op = [&](Idx const &l, Idx const &r) { return comp(begin[l], begin[r]); };
  StableSort(ctx, result.begin(), result.end(), op);
  return result;
}
} // namespace common
} // namespace xgboost
#if defined(GCC_HAS_PARALLEL)
#undef GCC_HAS_PARALLEL
#endif // defined(GCC_HAS_PARALLEL)
#if defined(MSVC_HAS_PARALLEL)
#undef MSVC_HAS_PARALLEL
#endif // defined(MSVC_HAS_PARALLEL)
#endif // XGBOOST_COMMON_ALGORITHM_H_

View File

@@ -42,9 +42,9 @@ constexpr inline bst_cat_t OutOfRangeCat() {
inline XGBOOST_DEVICE bool InvalidCat(float cat) {
constexpr auto kMaxCat = OutOfRangeCat();
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat)) == kMaxCat, "");
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat + 1)) != kMaxCat + 1, "");
static_assert(static_cast<float>(kMaxCat + 1) == kMaxCat, "");
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat)) == kMaxCat);
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat + 1)) != kMaxCat + 1);
static_assert(static_cast<float>(kMaxCat + 1) == kMaxCat);
return cat < 0 || cat >= kMaxCat;
}

View File

@@ -270,7 +270,9 @@ struct RyuPowLogUtils {
*/
static uint32_t MulPow5InvDivPow2(const uint32_t m, const uint32_t q,
const int32_t j) noexcept(true) {
return MulShift(m, kFloatPow5InvSplit[q], j);
static_assert(sizeof(kFloatPow5InvSplit) == 55 * sizeof(std::uint64_t));
assert(q < 55);
return MulShift(m, kFloatPow5InvSplit[q], j); // NOLINT
}
/*
@@ -495,12 +497,10 @@ class PowerBaseComputer {
static_cast<int32_t>(IEEE754::kFloatBias) -
static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
static_cast<int32_t>(2);
static_assert(static_cast<int32_t>(1) -
static_cast<int32_t>(IEEE754::kFloatBias) -
static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
static_cast<int32_t>(2) ==
-151,
"");
static_assert(static_cast<int32_t>(1) - static_cast<int32_t>(IEEE754::kFloatBias) -
static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
static_cast<int32_t>(2) ==
-151);
mantissa_base2 = f.mantissa;
} else {
base2_range.exponent = static_cast<int32_t>(f.exponent) - IEEE754::kFloatBias -
@@ -544,7 +544,7 @@ class RyuPrinter {
// Function precondition: v is not a 10-digit number.
// (f2s: 9 digits are sufficient for round-tripping.)
// (d2fixed: We print 9-digit blocks.)
static_assert(100000000 == Tens(8), "");
static_assert(100000000 == Tens(8));
assert(v < Tens(9));
if (v >= Tens(8)) {
return 9;
@@ -911,7 +911,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
// the bias and also special-case the value 0.
int32_t shift = (f_e2 == 0 ? 1 : f_e2) - exp_b2 - IEEE754::kFloatBias -
IEEE754::kFloatMantissaBits;
assert(shift >= 0);
assert(shift >= 1);
// We need to round up if the exact value is more than 0.5 above the value we
// computed. That's equivalent to checking if the last removed bit was 1 and
@@ -920,7 +920,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
//
// We need to update trailingZeros given that we have the exact output
// exponent ieee_e2 now.
trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0;
trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0; // NOLINT
uint32_t lastRemovedBit = (mantissa_b2 >> (shift - 1)) & 1;
bool roundup = (lastRemovedBit != 0) &&
(!trailing_zeros || (((mantissa_b2 >> shift) & 1) != 0));

View File

@@ -87,7 +87,7 @@ inline to_chars_result to_chars(char *first, char *last, int64_t value) { // NOL
if (value < 0) {
*first = '-';
std::advance(first, 1);
unsigned_value = uint64_t(~value) + uint64_t(1);
unsigned_value = static_cast<uint64_t>(~value) + static_cast<uint64_t>(1);
}
return detail::ToCharsUnsignedImpl(first, last, unsigned_value);
}

View File

@@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
feature_offsets_[fid] = accum_index;
}
SetTypeSize(gmat.max_num_bins);
SetTypeSize(gmat.MaxNumBinPerFeat());
auto storage_size =
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
index_.resize(storage_size, 0);

View File

@@ -188,17 +188,6 @@ inline void SetDevice(std::int32_t device) {
}
#endif
template <typename Idx, typename Container,
typename V = typename Container::value_type,
typename Comp = std::less<V>>
std::vector<Idx> ArgSort(Container const &array, Comp comp = std::less<V>{}) {
std::vector<Idx> result(array.size());
std::iota(result.begin(), result.end(), 0);
auto op = [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); };
XGBOOST_PARALLEL_STABLE_SORT(result.begin(), result.end(), op);
return result;
}
/**
* Last index of a group in a CSR style of index pointer.
*/
@@ -206,31 +195,6 @@ template <typename Indexable>
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
return indptr[group + 1] - 1;
}
/**
* \brief A CRTP (curiously recurring template pattern) helper function.
*
* https://www.fluentcpp.com/2017/05/19/crtp-helper/
*
* Does two things:
* 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class.
* 2. Avoids having to `static_cast` in a lot of places.
*
* \tparam T The derived class in a CRTP hierarchy.
*/
template <typename T>
struct Crtp {
T &Underlying() { return static_cast<T &>(*this); }
T const &Underlying() const { return static_cast<T const &>(*this); }
};
/**
* \brief C++17 std::as_const
*/
template <typename T>
typename std::add_const<T>::type &AsConst(T &v) noexcept { // NOLINT(runtime/references)
return v;
}
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_

View File

@@ -1,12 +1,13 @@
/*!
* Copyright 2017 by Contributors
/**
* Copyright 2017-2023 by XGBoost Contributors
* \file compressed_iterator.h
*/
#pragma once
#include <xgboost/base.h>
#include <cmath>
#include <cstddef>
#include <algorithm>
#include <cmath>
#include <cstddef> // for size_t
#include "common.h"
@@ -36,7 +37,7 @@ static const int kPadding = 4; // Assign padding so we can read slightly off
// The number of bits required to represent a given unsigned range
inline XGBOOST_DEVICE size_t SymbolBits(size_t num_symbols) {
auto bits = std::ceil(log2(static_cast<double>(num_symbols)));
return common::Max(static_cast<size_t>(bits), size_t(1));
return common::Max(static_cast<size_t>(bits), static_cast<std::size_t>(1));
}
} // namespace detail

View File

@@ -20,6 +20,7 @@
#include <algorithm>
#include <chrono>
#include <cstddef> // for size_t
#include <cub/cub.cuh>
#include <cub/util_allocator.cuh>
#include <numeric>
@@ -178,7 +179,7 @@ inline size_t MaxSharedMemory(int device_idx) {
dh::safe_cuda(cudaDeviceGetAttribute
(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
device_idx));
return size_t(max_shared_memory);
return static_cast<std::size_t>(max_shared_memory);
}
/**
@@ -195,7 +196,7 @@ inline size_t MaxSharedMemoryOptin(int device_idx) {
dh::safe_cuda(cudaDeviceGetAttribute
(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlockOptin,
device_idx));
return size_t(max_shared_memory);
return static_cast<std::size_t>(max_shared_memory);
}
inline void CheckComputeCapability() {

View File

@@ -46,7 +46,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
if (!use_sorted) {
HostSketchContainer container(max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
HostSketchContainer::UseGroup(info),
m->Info().data_split_mode == DataSplitMode::kCol, n_threads);
m->IsColumnSplit(), n_threads);
for (auto const& page : m->GetBatches<SparsePage>()) {
container.PushRowPage(page, info, hessian);
}
@@ -54,7 +54,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
} else {
SortedSketchContainer container{max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
HostSketchContainer::UseGroup(info),
m->Info().data_split_mode == DataSplitMode::kCol, n_threads};
m->IsColumnSplit(), n_threads};
for (auto const& page : m->GetBatches<SortedCSCPage>()) {
container.PushColPage(page, info, hessian);
}

View File

@@ -1,33 +1,31 @@
/*!
* Copyright 2018~2020 XGBoost contributors
/**
* Copyright 2018~2023 by XGBoost contributors
*/
#include <xgboost/logging.h>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
#include <xgboost/logging.h>
#include <cstddef> // for size_t
#include <memory>
#include <mutex>
#include <utility>
#include <vector>
#include "categorical.h"
#include "device_helpers.cuh"
#include "hist_util.h"
#include "hist_util.cuh"
#include "hist_util.h"
#include "math.h" // NOLINT
#include "quantile.h"
#include "categorical.h"
#include "xgboost/host_device_vector.h"
namespace xgboost {
namespace common {
@@ -318,7 +316,7 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
size_t batch_nnz = batch.data.Size();
auto const& info = dmat->Info();
for (auto begin = 0ull; begin < batch_nnz; begin += sketch_batch_num_elements) {
size_t end = std::min(batch_nnz, size_t(begin + sketch_batch_num_elements));
size_t end = std::min(batch_nnz, static_cast<std::size_t>(begin + sketch_batch_num_elements));
if (has_weights) {
bool is_ranking = HostSketchContainer::UseGroup(dmat->Info());
dh::caching_device_vector<uint32_t> groups(info.group_ptr_.cbegin(),

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2020 XGBoost contributors
/**
* Copyright 2020-2023 by XGBoost contributors
*
* \brief Front end and utilities for GPU based sketching. Works on sliding window
* instead of stream.
@@ -9,11 +9,13 @@
#include <thrust/host_vector.h>
#include <cstddef> // for size_t
#include "../data/device_adapter.cuh"
#include "device_helpers.cuh"
#include "hist_util.h"
#include "quantile.cuh"
#include "device_helpers.cuh"
#include "timer.h"
#include "../data/device_adapter.cuh"
namespace xgboost {
namespace common {
@@ -304,7 +306,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, true);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
ProcessWeightedSlidingWindow(batch, info,
num_cuts_per_feature,
HostSketchContainer::UseGroup(info), missing, device, num_cols, begin, end,
@@ -316,7 +319,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, false);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
sketch_container, num_cuts_per_feature);
}

View File

@@ -50,7 +50,7 @@ size_t PeekableInStream::PeekRead(void* dptr, size_t size) {
}
}
FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream), pointer_{0} {
FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream) {
size_t constexpr kInitialSize = 4096;
size_t size{kInitialSize}, total{0};
buffer_.clear();

View File

@@ -27,8 +27,7 @@ using MemoryBufferStream = rabit::utils::MemoryBufferStream;
*/
class PeekableInStream : public dmlc::Stream {
public:
explicit PeekableInStream(dmlc::Stream* strm)
: strm_(strm), buffer_ptr_(0) {}
explicit PeekableInStream(dmlc::Stream* strm) : strm_(strm) {}
size_t Read(void* dptr, size_t size) override;
virtual size_t PeekRead(void* dptr, size_t size);
@@ -41,7 +40,7 @@ class PeekableInStream : public dmlc::Stream {
/*! \brief input stream */
dmlc::Stream *strm_;
/*! \brief current buffer pointer */
size_t buffer_ptr_;
size_t buffer_ptr_{0};
/*! \brief internal buffer */
std::string buffer_;
};
@@ -72,7 +71,7 @@ class FixedSizeStream : public PeekableInStream {
void Take(std::string* out);
private:
size_t pointer_;
size_t pointer_{0};
std::string buffer_;
};

View File

@@ -710,10 +710,10 @@ void Json::Dump(Json json, JsonWriter* writer) {
writer->Save(json);
}
static_assert(std::is_nothrow_move_constructible<Json>::value, "");
static_assert(std::is_nothrow_move_constructible<Object>::value, "");
static_assert(std::is_nothrow_move_constructible<Array>::value, "");
static_assert(std::is_nothrow_move_constructible<String>::value, "");
static_assert(std::is_nothrow_move_constructible<Json>::value);
static_assert(std::is_nothrow_move_constructible<Object>::value);
static_assert(std::is_nothrow_move_constructible<Array>::value);
static_assert(std::is_nothrow_move_constructible<String>::value);
Json UBJReader::ParseArray() {
auto marker = PeekNextChar();

View File

@@ -14,7 +14,7 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
if (ctx->IsCPU()) {
auto const& h_values = values.ConstHostVector();
auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
static_assert(std::is_same<decltype(result), double>::value, "");
static_assert(std::is_same<decltype(result), double>::value);
return result;
}
return cuda_impl::Reduce(ctx, values);

View File

@@ -42,8 +42,8 @@ void RunLengthEncode(Iter begin, Iter end, std::vector<Idx>* p_out) {
*/
template <typename InIt, typename OutIt, typename T>
void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value, "");
static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value, "");
static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value);
static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value);
// The number of threads is pegged to the batch size. If the OMP block is parallelized
// on anything other than the batch/block size, it should be reassigned
auto n = static_cast<size_t>(std::distance(begin, end));

View File

@@ -31,6 +31,8 @@ namespace common {
// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
template<size_t BlockSize>
class PartitionBuilder {
using BitVector = RBitField8;
public:
template<typename Func>
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
@@ -121,27 +123,11 @@ class PartitionBuilder {
bool default_left = tree[nid].DefaultLeft();
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& index = gmat.index;
auto const& cut_values = gmat.cut.Values();
auto const& cut_ptrs = gmat.cut.Ptrs();
auto gidx_calc = [&](auto ridx) {
auto begin = gmat.RowIdx(ridx);
if (gmat.IsDense()) {
return static_cast<bst_bin_t>(index[begin + fid]);
}
auto end = gmat.RowIdx(ridx + 1);
auto f_begin = cut_ptrs[fid];
auto f_end = cut_ptrs[fid + 1];
// bypassing the column matrix as we need the cut value instead of bin idx for categorical
// features.
return BinarySearchBin(begin, end, index, f_begin, f_end);
};
auto pred_hist = [&](auto ridx, auto bin_id) {
if (any_cat && is_cat) {
auto gidx = gidx_calc(ridx);
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
go_left = Decision(node_cats, cut_values[gidx]);
@@ -153,7 +139,7 @@ class PartitionBuilder {
};
auto pred_approx = [&](auto ridx) {
auto gidx = gidx_calc(ridx);
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
if (is_cat) {
@@ -199,6 +185,84 @@ class PartitionBuilder {
SetNRightElems(node_in_set, range.begin(), n_right);
}
/**
 * @brief When data is split by column, we don't have all the features locally on the current
 * worker, so we go through all the rows and mark the bit vectors with the local decision, or
 * mark the row as missing when the feature value used for the split is absent here.
 *
 * Note: a set bit in `decision_bits` means the row goes LEFT (see the `go_left` branch
 * below); rows whose split feature has no local bin get the corresponding bit set in
 * `missing_bits` instead.  Bit positions are row indices relative to `gmat.base_rowid`.
 *
 * @param node_in_set    Index of the node inside `nodes`.
 * @param nodes          Candidate nodes being expanded.
 * @param range          Row range (into `rid`) this task is responsible for.
 * @param gmat           Quantized feature matrix used to look up bin indices/cut values.
 * @param column_matrix  Column store; only the uninitialized case is supported here.
 * @param tree           Tree providing the split feature/type for the node.
 * @param rid            Row-index buffer; [range.begin(), range.end()) selects this task's rows.
 * @param decision_bits  Output: bit set when the local decision is to go left.
 * @param missing_bits   Output: bit set when the split feature value is missing locally.
 */
void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
              const common::Range1d range, GHistIndexMatrix const& gmat,
              const common::ColumnMatrix& column_matrix,
              const RegTree& tree, const size_t* rid,
              BitVector* decision_bits, BitVector* missing_bits) {
  common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
  std::size_t nid = nodes[node_in_set].nid;
  bst_feature_t fid = tree[nid].SplitIndex();
  bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
  auto node_cats = tree.NodeCats(nid);
  auto const& cut_values = gmat.cut.Values();
  if (!column_matrix.IsInitialized()) {
    for (auto row_id : rid_span) {
      // gidx is -1 when the row has no entry for this feature (value missing locally).
      auto gidx = gmat.GetGindex(row_id, fid);
      if (gidx > -1) {
        bool go_left = false;
        if (is_cat) {
          go_left = Decision(node_cats, cut_values[gidx]);
        } else {
          go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
        }
        if (go_left) {
          decision_bits->Set(row_id - gmat.base_rowid);
        }
      } else {
        missing_bits->Set(row_id - gmat.base_rowid);
      }
    }
  } else {
    LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
  }
}
/**
 * @brief Once we've aggregated the decision and missing bits from all the workers, we can then
 * use them to partition the rows accordingly.
 *
 * A set bit in `decision_bits` sends the row left; for rows flagged in `missing_bits` the
 * node's default direction (`DefaultLeft`) is used.  Bit positions are row indices relative
 * to `gmat.base_rowid`, matching how MaskRows wrote them.
 *
 * @param node_in_set    Index of the node inside `nodes`.
 * @param nodes          Candidate nodes being expanded.
 * @param range          Row range (into `rid`) this task is responsible for.
 * @param gmat           Quantized feature matrix (provides base_rowid for bit addressing).
 * @param column_matrix  Column store; only the uninitialized case is supported here.
 * @param tree           Tree providing the node's default direction.
 * @param rid            Row-index buffer; [range.begin(), range.end()) selects this task's rows.
 * @param decision_bits  Aggregated per-row left/right decisions from all workers.
 * @param missing_bits   Aggregated per-row missing flags from all workers.
 */
void PartitionByMask(const size_t node_in_set,
                     std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
                     const common::Range1d range, GHistIndexMatrix const& gmat,
                     const common::ColumnMatrix& column_matrix, const RegTree& tree,
                     const size_t* rid, BitVector const& decision_bits,
                     BitVector const& missing_bits) {
  common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
  common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
  common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
  std::size_t nid = nodes[node_in_set].nid;
  bool default_left = tree[nid].DefaultLeft();
  auto pred_approx = [&](auto ridx) {
    // Missing rows follow the default direction; otherwise the aggregated decision wins.
    bool go_left = default_left;
    bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
    if (!is_missing) {
      go_left = decision_bits.Check(ridx - gmat.base_rowid);
    }
    return go_left;
  };
  std::pair<size_t, size_t> child_nodes_sizes;
  if (!column_matrix.IsInitialized()) {
    child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
  } else {
    LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
  }
  const size_t n_left = child_nodes_sizes.first;
  const size_t n_right = child_nodes_sizes.second;
  SetNLeftElems(node_in_set, range.begin(), n_left);
  SetNRightElems(node_in_set, range.begin(), n_right);
}
// allocate thread local memory, should be called for each specific task
void AllocateForTask(size_t id) {
if (mem_blocks_[id].get() == nullptr) {

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2020-2022 by XGBoost Contributors
/**
* Copyright 2020-2023 by XGBoost Contributors
*/
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
@@ -109,7 +109,7 @@ void PruneImpl(common::Span<SketchContainer::OffsetT const> cuts_ptr,
template <typename T, typename U>
void CopyTo(Span<T> out, Span<U> src) {
CHECK_EQ(out.size(), src.size());
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value, "");
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);
dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
out.size_bytes(),
cudaMemcpyDefault));
@@ -143,7 +143,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
thrust::make_zip_iterator(thrust::make_tuple(b_ind_iter, place_holder));
dh::XGBCachingDeviceAllocator<Tuple> alloc;
static_assert(sizeof(Tuple) == sizeof(SketchEntry), "");
static_assert(sizeof(Tuple) == sizeof(SketchEntry));
// We reuse the memory for storing merge path.
common::Span<Tuple> merge_path{reinterpret_cast<Tuple *>(out.data()), out.size()};
// Determine the merge path, 0 if element is from x, 1 if it's from y.

View File

@@ -24,8 +24,9 @@ std::shared_ptr<HostDeviceVector<bst_feature_t>> ColumnSampler::ColSample(
for (size_t i = 0; i < h_features.size(); ++i) {
weights[i] = feature_weights_[h_features[i]];
}
CHECK(ctx_);
new_features.HostVector() =
WeightedSamplingWithoutReplacement(p_features->HostVector(), weights, n);
WeightedSamplingWithoutReplacement(ctx_, p_features->HostVector(), weights, n);
} else {
new_features.Resize(features.size());
std::copy(features.begin(), features.end(), new_features.HostVector().begin());

View File

@@ -20,7 +20,9 @@
#include <vector>
#include "../collective/communicator-inl.h"
#include "algorithm.h" // ArgSort
#include "common.h"
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h"
namespace xgboost {
@@ -87,8 +89,8 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
* https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/
*/
template <typename T>
std::vector<T> WeightedSamplingWithoutReplacement(
std::vector<T> const &array, std::vector<float> const &weights, size_t n) {
std::vector<T> WeightedSamplingWithoutReplacement(Context const* ctx, std::vector<T> const& array,
std::vector<float> const& weights, size_t n) {
// ES sampling.
CHECK_EQ(array.size(), weights.size());
std::vector<float> keys(weights.size());
@@ -100,7 +102,7 @@ std::vector<T> WeightedSamplingWithoutReplacement(
auto k = std::log(u) / w;
keys[i] = k;
}
auto ind = ArgSort<size_t>(Span<float>{keys}, std::greater<>{});
auto ind = ArgSort<std::size_t>(ctx, keys.data(), keys.data() + keys.size(), std::greater<>{});
ind.resize(n);
std::vector<T> results(ind.size());
@@ -126,6 +128,7 @@ class ColumnSampler {
float colsample_bytree_{1.0f};
float colsample_bynode_{1.0f};
GlobalRandomEngine rng_;
Context const* ctx_;
public:
std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
@@ -157,12 +160,13 @@ class ColumnSampler {
* \param colsample_bytree
* \param skip_index_0 (Optional) True to skip index 0.
*/
void Init(int64_t num_col, std::vector<float> feature_weights, float colsample_bynode,
float colsample_bylevel, float colsample_bytree) {
void Init(Context const* ctx, int64_t num_col, std::vector<float> feature_weights,
float colsample_bynode, float colsample_bylevel, float colsample_bytree) {
feature_weights_ = std::move(feature_weights);
colsample_bylevel_ = colsample_bylevel;
colsample_bytree_ = colsample_bytree;
colsample_bynode_ = colsample_bynode;
ctx_ = ctx;
if (feature_set_tree_ == nullptr) {
feature_set_tree_ = std::make_shared<HostDeviceVector<bst_feature_t>>();

View File

@@ -77,14 +77,14 @@ class RowSetCollection {
if (row_indices_.empty()) { // edge case: empty instance set
constexpr size_t* kBegin = nullptr;
constexpr size_t* kEnd = nullptr;
static_assert(kEnd - kBegin == 0, "");
elem_of_each_node_.emplace_back(Elem(kBegin, kEnd, 0));
static_assert(kEnd - kBegin == 0);
elem_of_each_node_.emplace_back(kBegin, kEnd, 0);
return;
}
const size_t* begin = dmlc::BeginPtr(row_indices_);
const size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
elem_of_each_node_.emplace_back(Elem(begin, end, 0));
elem_of_each_node_.emplace_back(begin, end, 0);
}
std::vector<size_t>* Data() { return &row_indices_; }

View File

@@ -35,11 +35,11 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
auto iter = linalg::cbegin(ti_v);
float q{0};
if (opt_weights.Empty()) {
q = common::Quantile(0.5, iter, iter + ti_v.Size());
q = common::Quantile(ctx, 0.5, iter, iter + ti_v.Size());
} else {
CHECK_NE(t_v.Shape(1), 0);
auto w_it = common::MakeIndexTransformIter([&](std::size_t i) { return opt_weights[i]; });
q = common::WeightedQuantile(0.5, iter, iter + ti_v.Size(), w_it);
q = common::WeightedQuantile(ctx, 0.5, iter, iter + ti_v.Size(), w_it);
}
h_out(i) = q;
}

View File

@@ -4,46 +4,52 @@
#ifndef XGBOOST_COMMON_STATS_H_
#define XGBOOST_COMMON_STATS_H_
#include <algorithm>
#include <iterator>
#include <iterator> // for distance
#include <limits>
#include <vector>
#include "algorithm.h" // for StableSort
#include "common.h" // AssertGPUSupport, OptionalWeights
#include "optional_weight.h" // OptionalWeights
#include "transform_iterator.h" // MakeIndexTransformIter
#include "xgboost/context.h" // Context
#include "xgboost/linalg.h"
#include "xgboost/logging.h" // CHECK_GE
#include "xgboost/linalg.h" // TensorView,VectorView
#include "xgboost/logging.h" // CHECK_GE
namespace xgboost {
namespace common {
/**
* \brief Percentile with masked array using linear interpolation.
* @brief Quantile using linear interpolation.
*
* https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
*
* \param alpha Percentile, must be in range [0, 1].
* \param alpha Quantile, must be in range [0, 1].
* \param begin Iterator begin for input array.
* \param end Iterator end for input array.
*
* \return The result of interpolation.
*/
template <typename Iter>
float Quantile(double alpha, Iter const& begin, Iter const& end) {
float Quantile(Context const* ctx, double alpha, Iter const& begin, Iter const& end) {
CHECK(alpha >= 0 && alpha <= 1);
auto n = static_cast<double>(std::distance(begin, end));
if (n == 0) {
return std::numeric_limits<float>::quiet_NaN();
}
std::vector<size_t> sorted_idx(n);
std::vector<std::size_t> sorted_idx(n);
std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
[&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
if (omp_in_parallel()) {
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
} else {
StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
}
auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
static_assert(std::is_same<decltype(val(0)), float>::value, "");
static_assert(std::is_same<decltype(val(0)), float>::value);
if (alpha <= (1 / (n + 1))) {
return val(0);
@@ -51,7 +57,7 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
if (alpha >= (n / (n + 1))) {
return val(sorted_idx.size() - 1);
}
assert(n != 0 && "The number of rows in a leaf can not be zero.");
double x = alpha * static_cast<double>((n + 1));
double k = std::floor(x) - 1;
CHECK_GE(k, 0);
@@ -66,30 +72,35 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
* \brief Calculate the weighted quantile with step function. Unlike the unweighted
* version, no interpolation is used.
*
* See https://aakinshin.net/posts/weighted-quantiles/ for some discussion on computing
* See https://aakinshin.net/posts/weighted-quantiles/ for some discussions on computing
* weighted quantile with interpolation.
*/
template <typename Iter, typename WeightIter>
float WeightedQuantile(double alpha, Iter begin, Iter end, WeightIter weights) {
float WeightedQuantile(Context const* ctx, double alpha, Iter begin, Iter end, WeightIter w_begin) {
auto n = static_cast<double>(std::distance(begin, end));
if (n == 0) {
return std::numeric_limits<float>::quiet_NaN();
}
std::vector<size_t> sorted_idx(n);
std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
[&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
if (omp_in_parallel()) {
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
} else {
StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
}
auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
std::vector<float> weight_cdf(n); // S_n
// weighted cdf is sorted during construction
weight_cdf[0] = *(weights + sorted_idx[0]);
weight_cdf[0] = *(w_begin + sorted_idx[0]);
for (size_t i = 1; i < n; ++i) {
weight_cdf[i] = weight_cdf[i - 1] + *(weights + sorted_idx[i]);
weight_cdf[i] = weight_cdf[i - 1] + w_begin[sorted_idx[i]];
}
float thresh = weight_cdf.back() * alpha;
size_t idx =
std::size_t idx =
std::lower_bound(weight_cdf.cbegin(), weight_cdf.cend(), thresh) - weight_cdf.cbegin();
idx = std::min(idx, static_cast<size_t>(n - 1));
return val(idx);

View File

@@ -10,12 +10,13 @@
#include <cstring>
#include "../collective/communicator-inl.h"
#include "../common/algorithm.h" // StableSort
#include "../common/api_entry.h" // XGBAPIThreadLocalEntry
#include "../common/group_data.h"
#include "../common/io.h"
#include "../common/linalg_op.h"
#include "../common/math.h"
#include "../common/numeric.h"
#include "../common/numeric.h" // Iota
#include "../common/threading_utils.h"
#include "../common/version.h"
#include "../data/adapter.h"
@@ -258,6 +259,19 @@ void LoadFeatureType(std::vector<std::string>const& type_names, std::vector<Feat
}
}
/**
 * \brief Return the row indices ordered by the absolute value of the label.
 *
 * The ordering is computed lazily and memoized in label_order_cache_; the cached
 * result is reused as long as its size still matches the number of labels.
 */
const std::vector<size_t>& MetaInfo::LabelAbsSort(Context const* ctx) const {
  // Fast path: the memoized ordering is still valid for the current label set.
  if (label_order_cache_.size() == labels.Size()) {
    return label_order_cache_;
  }
  label_order_cache_.resize(labels.Size());
  common::Iota(ctx, label_order_cache_.begin(), label_order_cache_.end(), 0);
  auto const& h_labels = labels.Data()->HostVector();
  // Stable comparison on |label| keeps ties in their original relative order.
  auto abs_less = [&h_labels](size_t lhs, size_t rhs) {
    return std::abs(h_labels[lhs]) < std::abs(h_labels[rhs]);
  };
  common::StableSort(ctx, label_order_cache_.begin(), label_order_cache_.end(), abs_less);
  return label_order_cache_;
}
void MetaInfo::LoadBinary(dmlc::Stream *fi) {
auto version = Version::Load(fi);
auto major = std::get<0>(version);
@@ -898,6 +912,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
if (!cache_file.empty()) {
LOG(FATAL) << "Column-wise data split is not support for external memory.";
}
LOG(CONSOLE) << "Splitting data by column";
auto* sliced = dmat->SliceCol(npart, partid);
delete dmat;
return sliced;

View File

@@ -1,12 +1,14 @@
/*!
* Copyright (c) 2019 by Contributors
/**
* Copyright 2019-2023 by XGBoost Contributors
* \file device_adapter.cuh
*/
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
#define XGBOOST_DATA_DEVICE_ADAPTER_H_
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <string>
#include "../common/device_helpers.cuh"
#include "../common/math.h"
#include "adapter.h"
@@ -205,10 +207,10 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
}
});
dh::XGBCachingDeviceAllocator<char> alloc;
size_t row_stride = dh::Reduce(
thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(), size_t(0),
thrust::maximum<size_t>());
size_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<std::size_t>(0), thrust::maximum<size_t>());
return row_stride;
}
}; // namespace data

View File

@@ -21,13 +21,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
double sparse_thresh, bool sorted_sketch, int32_t n_threads,
common::Span<float> hess) {
common::Span<float> hess)
: max_numeric_bins_per_feat{max_bins_per_feat} {
CHECK(p_fmat->SingleColBlock());
// We use sorted sketching for approx tree method since it's more efficient in
// computation time (but higher memory usage).
cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
max_num_bins = max_bins_per_feat;
const uint32_t nbins = cut.Ptrs().back();
hit_count.resize(nbins, 0);
hit_count_tloc_.resize(n_threads * nbins, 0);
@@ -64,7 +64,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
: row_ptr(info.num_row_ + 1, 0),
hit_count(cuts.TotalBins(), 0),
cut{std::forward<common::HistogramCuts>(cuts)},
max_num_bins(max_bin_per_feat),
max_numeric_bins_per_feat(max_bin_per_feat),
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
#if !defined(XGBOOST_USE_CUDA)
@@ -87,13 +87,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
}
GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
bool isDense, double sparse_thresh, int32_t n_threads) {
common::HistogramCuts cuts, int32_t max_bins_per_feat,
bool isDense, double sparse_thresh, int32_t n_threads)
: cut{std::move(cuts)},
max_numeric_bins_per_feat{max_bins_per_feat},
base_rowid{batch.base_rowid},
isDense_{isDense} {
CHECK_GE(n_threads, 1);
base_rowid = batch.base_rowid;
isDense_ = isDense;
cut = cuts;
max_num_bins = max_bins_per_feat;
CHECK_EQ(row_ptr.size(), 0);
// The number of threads is pegged to the batch size. If the OMP
// block is parallelized on anything other than the batch/block size,
@@ -128,12 +128,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
#undef INSTANTIATION_PUSH
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
isDense) {
// compress dense index to uint8
index.SetBinTypeSize(common::kUint8BinsTypeSize);
index.Resize((sizeof(uint8_t)) * n_index);
} else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
} else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
isDense) {
// compress dense index to uint16
index.SetBinTypeSize(common::kUint16BinsTypeSize);
@@ -149,16 +150,24 @@ common::ColumnMatrix const &GHistIndexMatrix::Transpose() const {
return *columns_;
}
/**
 * \brief Look up the histogram bin index for a given row/feature pair.
 *
 * For a dense matrix the bin is addressed directly; for a sparse matrix the
 * row's index span is searched for a bin that belongs to the feature's cut
 * range (BinarySearchBin reports a miss — see the caller's -1 check).
 */
bst_bin_t GHistIndexMatrix::GetGindex(size_t ridx, size_t fidx) const {
  auto const row_begin = RowIdx(ridx);
  if (IsDense()) {
    // Dense layout stores one entry per feature, so the offset is row + fidx.
    return static_cast<bst_bin_t>(index[row_begin + fidx]);
  }
  auto const row_end = RowIdx(ridx + 1);
  auto const& feat_ptrs = cut.Ptrs();
  return BinarySearchBin(row_begin, row_end, index, feat_ptrs[fidx], feat_ptrs[fidx + 1]);
}
float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
auto const &values = cut.Values();
auto const &mins = cut.MinValues();
auto const &ptrs = cut.Ptrs();
if (is_cat) {
auto f_begin = ptrs[fidx];
auto f_end = ptrs[fidx + 1];
auto begin = RowIdx(ridx);
auto end = RowIdx(ridx + 1);
auto gidx = BinarySearchBin(begin, end, index, f_begin, f_end);
auto gidx = GetGindex(ridx, fidx);
if (gidx == -1) {
return std::numeric_limits<float>::quiet_NaN();
}

View File

@@ -65,7 +65,7 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
EllpackPage const& in_page, BatchParam const& p)
: max_num_bins{p.max_bin} {
: max_numeric_bins_per_feat{p.max_bin} {
auto page = in_page.Impl();
isDense_ = page->is_dense;

View File

@@ -134,11 +134,15 @@ class GHistIndexMatrix {
std::vector<size_t> hit_count;
/*! \brief The corresponding cuts */
common::HistogramCuts cut;
/*! \brief max_bin for each feature. */
bst_bin_t max_num_bins;
/** \brief max_bin for each feature. */
bst_bin_t max_numeric_bins_per_feat;
/*! \brief base row index for current page (used by external memory) */
size_t base_rowid{0};
bst_bin_t MaxNumBinPerFeat() const {
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
}
~GHistIndexMatrix();
/**
* \brief Constructor for SimpleDMatrix.
@@ -161,7 +165,7 @@ class GHistIndexMatrix {
* \brief Constructor for external memory.
*/
GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
double sparse_thresh, int32_t n_threads);
GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back.
@@ -224,6 +228,8 @@ class GHistIndexMatrix {
common::ColumnMatrix const& Transpose() const;
bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
private:

View File

@@ -35,7 +35,7 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
if (!fi->Read(&page->hit_count)) {
return false;
}
if (!fi->Read(&page->max_num_bins)) {
if (!fi->Read(&page->max_numeric_bins_per_feat)) {
return false;
}
if (!fi->Read(&page->base_rowid)) {
@@ -76,8 +76,8 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
sizeof(uint64_t);
// max_bins, base row, is_dense
fo->Write(page.max_num_bins);
bytes += sizeof(page.max_num_bins);
fo->Write(page.max_numeric_bins_per_feat);
bytes += sizeof(page.max_numeric_bins_per_feat);
fo->Write(page.base_rowid);
bytes += sizeof(page.base_rowid);
fo->Write(page.IsDense());

View File

@@ -213,7 +213,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
SyncFeatureType(&h_ft);
p_sketch.reset(new common::HostSketchContainer{
batch_param_.max_bin, h_ft, column_sizes, !proxy->Info().group_ptr_.empty(),
proxy->Info().data_split_mode == DataSplitMode::kCol, ctx_.Threads()});
proxy->IsColumnSplit(), ctx_.Threads()});
}
HostAdapterDispatch(proxy, [&](auto const& batch) {
proxy->Info().num_nonzero_ = batch_nnz[i];

View File

@@ -19,7 +19,7 @@ const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
namespace detail {
// Use device dispatch
std::size_t NSamplesDevice(DMatrixProxy *)
std::size_t NSamplesDevice(DMatrixProxy *) // NOLINT
#if defined(XGBOOST_USE_CUDA)
; // NOLINT
#else
@@ -28,7 +28,7 @@ std::size_t NSamplesDevice(DMatrixProxy *)
return 0;
}
#endif
std::size_t NFeaturesDevice(DMatrixProxy *)
std::size_t NFeaturesDevice(DMatrixProxy *) // NOLINT
#if defined(XGBOOST_USE_CUDA)
; // NOLINT
#else

View File

@@ -75,10 +75,7 @@ class GBLinear : public GradientBooster {
: GradientBooster{ctx},
learner_model_param_{learner_model_param},
model_{learner_model_param},
previous_model_{learner_model_param},
sum_instance_weight_(0),
sum_weight_complete_(false),
is_converged_(false) {}
previous_model_{learner_model_param} {}
void Configure(const Args& cfg) override {
if (model_.weight.size() == 0) {
@@ -344,10 +341,10 @@ class GBLinear : public GradientBooster {
GBLinearModel previous_model_;
GBLinearTrainParam param_;
std::unique_ptr<LinearUpdater> updater_;
double sum_instance_weight_;
bool sum_weight_complete_;
double sum_instance_weight_{};
bool sum_weight_complete_{false};
common::Monitor monitor_;
bool is_converged_;
bool is_converged_{false};
};
// register the objective functions

View File

@@ -47,12 +47,12 @@ class GBLinearModel : public Model {
DeprecatedGBLinearModelParam param_;
public:
int32_t num_boosted_rounds;
int32_t num_boosted_rounds{0};
LearnerModelParam const* learner_model_param;
public:
explicit GBLinearModel(LearnerModelParam const* learner_model_param) :
num_boosted_rounds{0}, learner_model_param {learner_model_param} {}
explicit GBLinearModel(LearnerModelParam const *learner_model_param)
: learner_model_param{learner_model_param} {}
void Configure(Args const &) { }
// weight for each of feature, bias is the last one

View File

@@ -32,15 +32,14 @@
#include "xgboost/string_view.h"
#include "xgboost/tree_updater.h"
namespace xgboost {
namespace gbm {
namespace xgboost::gbm {
DMLC_REGISTRY_FILE_TAG(gbtree);
void GBTree::Configure(const Args& cfg) {
void GBTree::Configure(Args const& cfg) {
this->cfg_ = cfg;
std::string updater_seq = tparam_.updater_seq;
tparam_.UpdateAllowUnknown(cfg);
tree_param_.UpdateAllowUnknown(cfg);
model_.Configure(cfg);
@@ -235,9 +234,11 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
CHECK_EQ(model_.param.num_parallel_tree, trees.size());
CHECK_EQ(model_.param.num_parallel_tree, 1)
<< "Boosting random forest is not supported for current objective.";
CHECK_EQ(trees.size(), model_.param.num_parallel_tree);
for (std::size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) {
auto const& position = node_position.at(tree_idx);
obj->UpdateTreeLeaf(position, p_fmat->Info(), predictions, group_idx, trees[tree_idx].get());
obj->UpdateTreeLeaf(position, p_fmat->Info(), tree_param_.learning_rate / trees.size(),
predictions, group_idx, trees[tree_idx].get());
}
}
@@ -388,9 +389,15 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
CHECK(out_position);
out_position->resize(new_trees.size());
// Rescale learning rate according to the size of trees
auto lr = tree_param_.learning_rate;
tree_param_.learning_rate /= static_cast<float>(new_trees.size());
for (auto& up : updaters_) {
up->Update(gpair, p_fmat, common::Span<HostDeviceVector<bst_node_t>>{*out_position}, new_trees);
up->Update(&tree_param_, gpair, p_fmat,
common::Span<HostDeviceVector<bst_node_t>>{*out_position}, new_trees);
}
tree_param_.learning_rate = lr;
}
void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) {
@@ -404,6 +411,8 @@ void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& ne
void GBTree::LoadConfig(Json const& in) {
CHECK_EQ(get<String>(in["name"]), "gbtree");
FromJson(in["gbtree_train_param"], &tparam_);
FromJson(in["tree_train_param"], &tree_param_);
// Process type cannot be kUpdate from loaded model
// This would cause all trees to be pushed to trees_to_update
// e.g. updating a model, then saving and loading it would result in an empty model
@@ -451,6 +460,7 @@ void GBTree::SaveConfig(Json* p_out) const {
auto& out = *p_out;
out["name"] = String("gbtree");
out["gbtree_train_param"] = ToJson(tparam_);
out["tree_train_param"] = ToJson(tree_param_);
// Process type cannot be kUpdate from loaded model
// This would cause all trees to be pushed to trees_to_update
@@ -1058,5 +1068,4 @@ XGBOOST_REGISTER_GBM(Dart, "dart")
GBTree* p = new Dart(booster_config, ctx);
return p;
});
} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm

View File

@@ -20,6 +20,7 @@
#include "../common/common.h"
#include "../common/timer.h"
#include "../tree/param.h" // TrainParam
#include "gbtree_model.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
@@ -405,8 +406,8 @@ class GBTree : public GradientBooster {
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
}
std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const override {
[[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const override {
return model_.DumpModel(fmap, with_stats, this->ctx_->Threads(), format);
}
@@ -428,6 +429,8 @@ class GBTree : public GradientBooster {
GBTreeModel model_;
// training parameter
GBTreeTrainParam tparam_;
// Tree training parameter
tree::TrainParam tree_param_;
// ----training fields----
bool showed_updater_warning_ {false};
bool specified_updater_ {false};

View File

@@ -21,7 +21,7 @@
#include <sstream>
#include <stack>
#include <string>
#include <utility>
#include <utility> // for as_const
#include <vector>
#include "collective/communicator-inl.h"
@@ -257,11 +257,11 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
: LearnerModelParam{user_param, t} {
std::swap(base_score_, base_margin);
// Make sure read access everywhere for thread-safe prediction.
common::AsConst(base_score_).HostView();
std::as_const(base_score_).HostView();
if (!ctx->IsCPU()) {
common::AsConst(base_score_).View(ctx->gpu_id);
std::as_const(base_score_).View(ctx->gpu_id);
}
CHECK(common::AsConst(base_score_).Data()->HostCanRead());
CHECK(std::as_const(base_score_).Data()->HostCanRead());
}
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(int32_t device) const {
@@ -287,9 +287,9 @@ void LearnerModelParam::Copy(LearnerModelParam const& that) {
base_score_.Reshape(that.base_score_.Shape());
base_score_.Data()->SetDevice(that.base_score_.DeviceIdx());
base_score_.Data()->Copy(*that.base_score_.Data());
common::AsConst(base_score_).HostView();
std::as_const(base_score_).HostView();
if (that.base_score_.DeviceIdx() != Context::kCpuId) {
common::AsConst(base_score_).View(that.base_score_.DeviceIdx());
std::as_const(base_score_).View(that.base_score_.DeviceIdx());
}
CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
CHECK(base_score_.Data()->HostCanRead());
@@ -328,9 +328,6 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
using LearnerAPIThreadLocalStore =
dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;
using ThreadLocalPredictionCache =
dmlc::ThreadLocalStore<std::map<Learner const *, PredictionContainer>>;
namespace {
StringView ModelMsg() {
return StringView{
@@ -368,6 +365,8 @@ class LearnerConfiguration : public Learner {
LearnerModelParam learner_model_param_;
LearnerTrainParam tparam_;
// Initial prediction.
PredictionContainer prediction_container_;
std::vector<std::string> metric_names_;
void ConfigureModelParamWithoutBaseScore() {
@@ -426,22 +425,15 @@ class LearnerConfiguration : public Learner {
}
public:
explicit LearnerConfiguration(std::vector<std::shared_ptr<DMatrix> > cache)
explicit LearnerConfiguration(std::vector<std::shared_ptr<DMatrix>> cache)
: need_configuration_{true} {
monitor_.Init("Learner");
auto& local_cache = (*ThreadLocalPredictionCache::Get())[this];
for (std::shared_ptr<DMatrix> const& d : cache) {
if (d) {
local_cache.Cache(d, Context::kCpuId);
prediction_container_.Cache(d, Context::kCpuId);
}
}
}
~LearnerConfiguration() override {
auto local_cache = ThreadLocalPredictionCache::Get();
if (local_cache->find(this) != local_cache->cend()) {
local_cache->erase(this);
}
}
// Configuration before data is known.
void Configure() override {
@@ -499,10 +491,6 @@ class LearnerConfiguration : public Learner {
CHECK_NE(learner_model_param_.BaseScore(this->Ctx()).Size(), 0) << ModelNotFitted();
}
virtual PredictionContainer* GetPredictionCache() const {
return &((*ThreadLocalPredictionCache::Get())[this]);
}
void LoadConfig(Json const& in) override {
// If configuration is loaded, ensure that the model came from the same version
CHECK(IsA<Object>(in));
@@ -741,11 +729,10 @@ class LearnerConfiguration : public Learner {
if (mparam_.num_feature == 0) {
// TODO(hcho3): Change num_feature to 64-bit integer
unsigned num_feature = 0;
auto local_cache = this->GetPredictionCache();
for (auto& matrix : local_cache->Container()) {
CHECK(matrix.first);
for (auto const& matrix : prediction_container_.Container()) {
CHECK(matrix.first.ptr);
CHECK(!matrix.second.ref.expired());
const uint64_t num_col = matrix.first->Info().num_col_;
const uint64_t num_col = matrix.first.ptr->Info().num_col_;
CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
<< "Unfortunately, XGBoost does not support data matrices with "
<< std::numeric_limits<unsigned>::max() << " features or greater";
@@ -817,13 +804,13 @@ class LearnerConfiguration : public Learner {
*/
void ConfigureTargets() {
CHECK(this->obj_);
auto const& cache = this->GetPredictionCache()->Container();
auto const& cache = prediction_container_.Container();
size_t n_targets = 1;
for (auto const& d : cache) {
if (n_targets == 1) {
n_targets = this->obj_->Targets(d.first->Info());
n_targets = this->obj_->Targets(d.first.ptr->Info());
} else {
auto t = this->obj_->Targets(d.first->Info());
auto t = this->obj_->Targets(d.first.ptr->Info());
CHECK(n_targets == t || 1 == t) << "Inconsistent labels.";
}
}
@@ -1275,8 +1262,7 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true);
auto local_cache = this->GetPredictionCache();
auto& predt = local_cache->Cache(train, ctx_.gpu_id);
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
monitor_.Start("PredictRaw");
this->PredictRaw(train.get(), &predt, true, 0, 0);
@@ -1303,8 +1289,7 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true);
auto local_cache = this->GetPredictionCache();
auto& predt = local_cache->Cache(train, ctx_.gpu_id);
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
monitor_.Stop("BoostOneIter");
}
@@ -1326,10 +1311,9 @@ class LearnerImpl : public LearnerIO {
metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
}
auto local_cache = this->GetPredictionCache();
for (size_t i = 0; i < data_sets.size(); ++i) {
std::shared_ptr<DMatrix> m = data_sets[i];
auto &predt = local_cache->Cache(m, ctx_.gpu_id);
auto &predt = prediction_container_.Cache(m, ctx_.gpu_id);
this->ValidateDMatrix(m.get(), false);
this->PredictRaw(m.get(), &predt, false, 0, 0);
@@ -1370,8 +1354,7 @@ class LearnerImpl : public LearnerIO {
} else if (pred_leaf) {
gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
} else {
auto local_cache = this->GetPredictionCache();
auto& prediction = local_cache->Cache(data, ctx_.gpu_id);
auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id);
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
// Copy the prediction cache to output prediction. out_preds comes from C API
out_preds->SetDevice(ctx_.gpu_id);

View File

@@ -14,9 +14,11 @@
#include <utility>
#include <vector>
#include "../common/algorithm.h" // ArgSort
#include "../common/math.h"
#include "../common/optional_weight.h" // OptionalWeights
#include "metric_common.h" // MetricNoCache
#include "xgboost/context.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/linalg.h"
#include "xgboost/metric.h"
@@ -77,9 +79,8 @@ BinaryAUC(common::Span<float const> predts, linalg::VectorView<float const> labe
* Machine Learning Models
*/
template <typename BinaryAUC>
double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
size_t n_classes, int32_t n_threads,
BinaryAUC &&binary_auc) {
double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaInfo const &info,
size_t n_classes, int32_t n_threads, BinaryAUC &&binary_auc) {
CHECK_NE(n_classes, 0);
auto const labels = info.labels.View(Context::kCpuId);
if (labels.Shape(0) != 0) {
@@ -108,7 +109,7 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
}
double fp;
std::tie(fp, tp(c), auc(c)) =
binary_auc(proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
binary_auc(ctx, proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
local_area(c) = fp * tp(c);
});
}
@@ -139,23 +140,26 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
return auc_sum;
}
std::tuple<double, double, double> BinaryROCAUC(common::Span<float const> predts,
std::tuple<double, double, double> BinaryROCAUC(Context const *ctx,
common::Span<float const> predts,
linalg::VectorView<float const> labels,
common::OptionalWeights weights) {
auto const sorted_idx = common::ArgSort<size_t>(predts, std::greater<>{});
auto const sorted_idx =
common::ArgSort<size_t>(ctx, predts.data(), predts.data() + predts.size(), std::greater<>{});
return BinaryAUC(predts, labels, weights, sorted_idx, TrapezoidArea);
}
/**
* Calculate AUC for 1 ranking group;
*/
double GroupRankingROC(common::Span<float const> predts,
double GroupRankingROC(Context const* ctx, common::Span<float const> predts,
linalg::VectorView<float const> labels, float w) {
// on ranking, we just count all pairs.
double auc{0};
// argsort doesn't support tensor input yet.
auto raw_labels = labels.Values().subspan(0, labels.Size());
auto const sorted_idx = common::ArgSort<size_t>(raw_labels, std::greater<>{});
auto const sorted_idx = common::ArgSort<size_t>(
ctx, raw_labels.data(), raw_labels.data() + raw_labels.size(), std::greater<>{});
w = common::Sqr(w);
double sum_w = 0.0f;
@@ -185,10 +189,11 @@ double GroupRankingROC(common::Span<float const> predts,
*
* https://doi.org/10.1371/journal.pone.0092209
*/
std::tuple<double, double, double> BinaryPRAUC(common::Span<float const> predts,
std::tuple<double, double, double> BinaryPRAUC(Context const *ctx, common::Span<float const> predts,
linalg::VectorView<float const> labels,
common::OptionalWeights weights) {
auto const sorted_idx = common::ArgSort<size_t>(predts, std::greater<>{});
auto const sorted_idx =
common::ArgSort<size_t>(ctx, predts.data(), predts.data() + predts.size(), std::greater<>{});
double total_pos{0}, total_neg{0};
for (size_t i = 0; i < labels.Size(); ++i) {
auto w = weights[i];
@@ -211,9 +216,8 @@ std::tuple<double, double, double> BinaryPRAUC(common::Span<float const> predts,
* Cast LTR problem to binary classification problem by comparing pairs.
*/
template <bool is_roc>
std::pair<double, uint32_t> RankingAUC(std::vector<float> const &predts,
MetaInfo const &info,
int32_t n_threads) {
std::pair<double, uint32_t> RankingAUC(Context const *ctx, std::vector<float> const &predts,
MetaInfo const &info, int32_t n_threads) {
CHECK_GE(info.group_ptr_.size(), 2);
uint32_t n_groups = info.group_ptr_.size() - 1;
auto s_predts = common::Span<float const>{predts};
@@ -237,9 +241,9 @@ std::pair<double, uint32_t> RankingAUC(std::vector<float> const &predts,
auc = 0;
} else {
if (is_roc) {
auc = GroupRankingROC(g_predts, g_labels, w);
auc = GroupRankingROC(ctx, g_predts, g_labels, w);
} else {
auc = std::get<2>(BinaryPRAUC(g_predts, g_labels, common::OptionalWeights{w}));
auc = std::get<2>(BinaryPRAUC(ctx, g_predts, g_labels, common::OptionalWeights{w}));
}
if (std::isnan(auc)) {
invalid_groups++;
@@ -344,7 +348,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
auto n_threads = ctx_->Threads();
if (ctx_->gpu_id == Context::kCpuId) {
std::tie(auc, valid_groups) =
RankingAUC<true>(predts.ConstHostVector(), info, n_threads);
RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
} else {
std::tie(auc, valid_groups) =
GPURankingAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_);
@@ -358,8 +362,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
auto n_threads = ctx_->Threads();
CHECK_NE(n_classes, 0);
if (ctx_->gpu_id == Context::kCpuId) {
auc = MultiClassOVR(predts.ConstHostVector(), info, n_classes, n_threads,
BinaryROCAUC);
auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
} else {
auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
}
@@ -370,9 +373,9 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double fp, tp, auc;
if (ctx_->gpu_id == Context::kCpuId) {
std::tie(fp, tp, auc) =
BinaryROCAUC(predts.ConstHostVector(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(fp, tp, auc) = GPUBinaryROCAUC(predts.ConstDeviceSpan(), info,
ctx_->gpu_id, &this->d_cache_);
@@ -422,7 +425,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double pr, re, auc;
if (ctx_->gpu_id == Context::kCpuId) {
std::tie(pr, re, auc) =
BinaryPRAUC(predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(pr, re, auc) = GPUBinaryPRAUC(predts.ConstDeviceSpan(), info,
@@ -435,8 +438,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
size_t n_classes) {
if (ctx_->gpu_id == Context::kCpuId) {
auto n_threads = this->ctx_->Threads();
return MultiClassOVR(predts.ConstHostSpan(), info, n_classes, n_threads,
BinaryPRAUC);
return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
} else {
return GPUMultiClassPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_, n_classes);
}
@@ -453,7 +455,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
InvalidLabels();
}
std::tie(auc, valid_groups) =
RankingAUC<false>(predts.ConstHostVector(), info, n_threads);
RankingAUC<false>(ctx_, predts.ConstHostVector(), info, n_threads);
} else {
std::tie(auc, valid_groups) =
GPURankingPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_);

View File

@@ -5,7 +5,7 @@
#include <algorithm>
#include <cassert>
#include <cub/cub.cuh>
#include <cub/cub.cuh> // NOLINT
#include <limits>
#include <memory>
#include <tuple>

View File

@@ -451,9 +451,8 @@ class QuantileError : public MetricNoCache {
auto alpha = ctx->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
std::size_t n_targets = preds.Size() / info.num_row_ / alpha_.Size();
CHECK_NE(n_targets, 0);
auto y_predt = linalg::MakeTensorView(
ctx->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan(),
{static_cast<std::size_t>(info.num_row_), alpha_.Size(), n_targets}, ctx->gpu_id);
auto y_predt = linalg::MakeTensorView(ctx, &preds, static_cast<std::size_t>(info.num_row_),
alpha_.Size(), n_targets);
info.weights_.SetDevice(ctx->gpu_id);
common::OptionalWeights weight{ctx->IsCPU() ? info.weights_.ConstHostSpan()

View File

@@ -6,6 +6,7 @@
#define XGBOOST_METRIC_METRIC_COMMON_H_
#include <limits>
#include <memory> // shared_ptr
#include <string>
#include "../common/common.h"

View File

@@ -27,6 +27,7 @@
#include <vector>
#include "../collective/communicator-inl.h"
#include "../common/algorithm.h" // Sort
#include "../common/math.h"
#include "../common/ranking_utils.h" // MakeMetricName
#include "../common/threading_utils.h"
@@ -113,7 +114,7 @@ struct EvalAMS : public MetricNoCache {
const auto &h_preds = preds.ConstHostVector();
common::ParallelFor(ndata, ctx_->Threads(),
[&](bst_omp_uint i) { rec[i] = std::make_pair(h_preds[i], i); });
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
common::Sort(ctx_, rec.begin(), rec.end(), common::CmpFirst);
auto ntop = static_cast<unsigned>(ratio_ * ndata);
if (ntop == 0) ntop = ndata;
const double br = 10.0;
@@ -330,7 +331,7 @@ struct EvalCox : public MetricNoCache {
using namespace std; // NOLINT(*)
const auto ndata = static_cast<bst_omp_uint>(info.labels.Size());
const auto &label_order = info.LabelAbsSort();
const auto &label_order = info.LabelAbsSort(ctx_);
// pre-compute a sum for the denominator
double exp_p_sum = 0; // we use double because we might need the precision with large datasets

View File

@@ -3,27 +3,34 @@
*/
#include "adaptive.h"
#include <limits>
#include <vector>
#include <algorithm> // std::transform,std::find_if,std::copy,std::unique
#include <cmath> // std::isnan
#include <cstddef> // std::size_t
#include <iterator> // std::distance
#include <vector> // std::vector
#include "../common/common.h"
#include "../common/numeric.h"
#include "../common/stats.h"
#include "../common/threading_utils.h"
#include "../common/algorithm.h" // ArgSort
#include "../common/common.h" // AssertGPUSupport
#include "../common/numeric.h" // RunLengthEncode
#include "../common/stats.h" // Quantile,WeightedQuantile
#include "../common/threading_utils.h" // ParallelFor
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "xgboost/linalg.h"
#include "xgboost/tree_model.h"
#include "xgboost/base.h" // bst_node_t
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // MakeTensorView
#include "xgboost/span.h" // Span
#include "xgboost/tree_model.h" // RegTree
namespace xgboost {
namespace obj {
namespace detail {
void EncodeTreeLeafHost(RegTree const& tree, std::vector<bst_node_t> const& position,
std::vector<size_t>* p_nptr, std::vector<bst_node_t>* p_nidx,
std::vector<size_t>* p_ridx) {
namespace xgboost::obj::detail {
void EncodeTreeLeafHost(Context const* ctx, RegTree const& tree,
std::vector<bst_node_t> const& position, std::vector<size_t>* p_nptr,
std::vector<bst_node_t>* p_nidx, std::vector<size_t>* p_ridx) {
auto& nptr = *p_nptr;
auto& nidx = *p_nidx;
auto& ridx = *p_ridx;
ridx = common::ArgSort<size_t>(position);
ridx = common::ArgSort<size_t>(ctx, position.cbegin(), position.cend());
std::vector<bst_node_t> sorted_pos(position);
// permutation
for (size_t i = 0; i < position.size(); ++i) {
@@ -67,18 +74,18 @@ void EncodeTreeLeafHost(RegTree const& tree, std::vector<bst_node_t> const& posi
}
void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
auto& tree = *p_tree;
std::vector<bst_node_t> nidx;
std::vector<size_t> nptr;
std::vector<size_t> ridx;
EncodeTreeLeafHost(*p_tree, position, &nptr, &nidx, &ridx);
EncodeTreeLeafHost(ctx, *p_tree, position, &nptr, &nidx, &ridx);
size_t n_leaf = nidx.size();
if (nptr.empty()) {
std::vector<float> quantiles;
UpdateLeafValues(&quantiles, nidx, p_tree);
UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
return;
}
@@ -89,8 +96,8 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
auto const& h_node_idx = nidx;
auto const& h_node_ptr = nptr;
CHECK_LE(h_node_ptr.back(), info.num_row_);
auto h_predt = linalg::MakeTensorView(predt.ConstHostSpan(),
{info.num_row_, predt.Size() / info.num_row_}, ctx->gpu_id);
auto h_predt = linalg::MakeTensorView(ctx, predt.ConstHostSpan(), info.num_row_,
predt.Size() / info.num_row_);
// loop over each leaf
common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
@@ -99,8 +106,8 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
CHECK_LT(k + 1, h_node_ptr.size());
size_t n = h_node_ptr[k + 1] - h_node_ptr[k];
auto h_row_set = common::Span<size_t const>{ridx}.subspan(h_node_ptr[k], n);
CHECK_LE(group_idx, info.labels.Shape(1));
auto h_labels = info.labels.HostView().Slice(linalg::All(), group_idx);
auto h_labels = info.labels.HostView().Slice(linalg::All(), IdxY(info, group_idx));
auto h_weights = linalg::MakeVec(&info.weights_);
auto iter = common::MakeIndexTransformIter([&](size_t i) -> float {
@@ -114,9 +121,9 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
float q{0};
if (info.weights_.Empty()) {
q = common::Quantile(alpha, iter, iter + h_row_set.size());
q = common::Quantile(ctx, alpha, iter, iter + h_row_set.size());
} else {
q = common::WeightedQuantile(alpha, iter, iter + h_row_set.size(), w_it);
q = common::WeightedQuantile(ctx, alpha, iter, iter + h_row_set.size(), w_it);
}
if (std::isnan(q)) {
CHECK(h_row_set.empty());
@@ -124,8 +131,13 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
quantiles.at(k) = q;
});
UpdateLeafValues(&quantiles, nidx, p_tree);
UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
}
} // namespace detail
} // namespace obj
} // namespace xgboost
#if !defined(XGBOOST_USE_CUDA)
void UpdateTreeLeafDevice(Context const*, common::Span<bst_node_t const>, std::int32_t,
MetaInfo const&, float, HostDeviceVector<float> const&, float, RegTree*) {
common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace xgboost::obj::detail

View File

@@ -3,8 +3,8 @@
*/
#include <thrust/sort.h>
#include <cstdint> // std::int32_t
#include <cub/cub.cuh>
#include <cstdint> // std::int32_t
#include <cub/cub.cuh> // NOLINT
#include "../common/cuda_context.cuh" // CUDAContext
#include "../common/device_helpers.cuh"
@@ -20,20 +20,19 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
// copy position to buffer
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
auto cuctx = ctx->CUDACtx();
size_t n_samples = position.size();
dh::XGBDeviceAllocator<char> alloc;
dh::device_vector<bst_node_t> sorted_position(position.size());
dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
position.size_bytes(), cudaMemcpyDeviceToDevice));
position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));
p_ridx->resize(position.size());
dh::Iota(dh::ToSpan(*p_ridx));
// sort row index according to node index
thrust::stable_sort_by_key(thrust::cuda::par(alloc), sorted_position.begin(),
thrust::stable_sort_by_key(cuctx->TP(), sorted_position.begin(),
sorted_position.begin() + n_samples, p_ridx->begin());
dh::XGBCachingDeviceAllocator<char> caching;
size_t beg_pos =
thrust::find_if(thrust::cuda::par(caching), sorted_position.cbegin(), sorted_position.cend(),
thrust::find_if(cuctx->CTP(), sorted_position.cbegin(), sorted_position.cend(),
[] XGBOOST_DEVICE(bst_node_t nidx) { return nidx >= 0; }) -
sorted_position.cbegin();
if (beg_pos == sorted_position.size()) {
@@ -72,7 +71,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
size_t* h_num_runs = reinterpret_cast<size_t*>(pinned.subspan(0, sizeof(size_t)).data());
dh::CUDAEvent e;
e.Record(dh::DefaultStream());
e.Record(cuctx->Stream());
copy_stream.View().Wait(e);
// flag for whether there's ignored position
bst_node_t* h_first_unique =
@@ -108,7 +107,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
d_node_ptr[0] = beg_pos;
}
});
thrust::inclusive_scan(thrust::cuda::par(caching), dh::tbegin(d_node_ptr), dh::tend(d_node_ptr),
thrust::inclusive_scan(cuctx->CTP(), dh::tbegin(d_node_ptr), dh::tend(d_node_ptr),
dh::tbegin(d_node_ptr));
copy_stream.View().Sync();
CHECK_GT(*h_num_runs, 0);
@@ -141,7 +140,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
}
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::device_vector<size_t> ridx;
@@ -152,17 +151,17 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
if (nptr.Empty()) {
std::vector<float> quantiles;
UpdateLeafValues(&quantiles, nidx.ConstHostVector(), p_tree);
UpdateLeafValues(&quantiles, nidx.ConstHostVector(), learning_rate, p_tree);
}
HostDeviceVector<float> quantiles;
predt.SetDevice(ctx->gpu_id);
auto d_predt = linalg::MakeTensorView(predt.ConstDeviceSpan(),
{info.num_row_, predt.Size() / info.num_row_}, ctx->gpu_id);
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), info.num_row_,
predt.Size() / info.num_row_);
CHECK_LT(group_idx, d_predt.Shape(1));
auto t_predt = d_predt.Slice(linalg::All(), group_idx);
auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), group_idx);
auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), IdxY(info, group_idx));
auto d_row_index = dh::ToSpan(ridx);
auto seg_beg = nptr.DevicePointer();
@@ -187,7 +186,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
w_it + d_weights.size(), &quantiles);
}
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), p_tree);
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), learning_rate, p_tree);
}
} // namespace detail
} // namespace obj

View File

@@ -6,13 +6,15 @@
#include <algorithm>
#include <cstdint> // std::int32_t
#include <limits>
#include <vector>
#include <vector> // std::vector
#include "../collective/communicator-inl.h"
#include "../common/common.h"
#include "xgboost/context.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/tree_model.h"
#include "xgboost/base.h" // bst_node_t
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/tree_model.h" // RegTree
namespace xgboost {
namespace obj {
@@ -34,7 +36,7 @@ inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
}
inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_node_t> const& nidx,
RegTree* p_tree) {
float learning_rate, RegTree* p_tree) {
auto& tree = *p_tree;
auto& quantiles = *p_quantiles;
auto const& h_node_idx = nidx;
@@ -69,17 +71,39 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
auto nidx = h_node_idx[i];
auto q = quantiles[i];
CHECK(tree[nidx].IsLeaf());
tree[nidx].SetLeaf(q);
tree[nidx].SetLeaf(q * learning_rate);
}
}
inline std::size_t IdxY(MetaInfo const& info, bst_group_t group_idx) {
std::size_t y_idx{0};
if (info.labels.Shape(1) > 1) {
y_idx = group_idx;
}
CHECK_LE(y_idx, info.labels.Shape(1));
return y_idx;
}
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree);
void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree);
} // namespace detail
inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> const& position,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
if (ctx->IsCPU()) {
detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
predt, alpha, p_tree);
} else {
position.SetDevice(ctx->gpu_id);
detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
predt, alpha, p_tree);
}
}
} // namespace obj
} // namespace xgboost

View File

@@ -0,0 +1,44 @@
/**
* Copyright 2022-2023 by XGBoost contributors
*/
#include "init_estimation.h"
#include <memory> // unique_ptr
#include "../common/stats.h" // Mean
#include "../tree/fit_stump.h" // FitStump
#include "xgboost/base.h" // GradientPair
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/json.h" // Json
#include "xgboost/linalg.h" // Tensor,Vector
#include "xgboost/task.h" // ObjInfo
namespace xgboost {
namespace obj {
void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const {
if (this->Task().task == ObjInfo::kRegression) {
CheckInitInputs(info);
}
// Avoid altering any state in child objective.
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
HostDeviceVector<GradientPair> gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id);
Json config{Object{}};
this->SaveConfig(&config);
std::unique_ptr<ObjFunction> new_obj{
ObjFunction::Create(get<String const>(config["name"]), this->ctx_)};
new_obj->LoadConfig(config);
new_obj->GetGradient(dummy_predt, info, 0, &gpair);
bst_target_t n_targets = this->Targets(info);
linalg::Vector<float> leaf_weight;
tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);
// workaround, we don't support multi-target due to binary model serialization for
// base margin.
common::Mean(this->ctx_, leaf_weight, base_score);
this->PredTransform(base_score->Data());
}
} // namespace obj
} // namespace xgboost

View File

@@ -0,0 +1,25 @@
/**
* Copyright 2022-2023 by XGBoost contributors
*/
#ifndef XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
#define XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
#include "xgboost/data.h" // MetaInfo
#include "xgboost/linalg.h" // Tensor
#include "xgboost/objective.h" // ObjFunction
namespace xgboost {
namespace obj {
class FitIntercept : public ObjFunction {
void InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const override;
};
inline void CheckInitInputs(MetaInfo const& info) {
CHECK_EQ(info.labels.Shape(0), info.num_row_) << "Invalid shape of labels.";
if (!info.weights_.Empty()) {
CHECK_EQ(info.weights_.Size(), info.num_row_)
<< "Number of weights should be equal to number of data points.";
}
}
} // namespace obj
} // namespace xgboost
#endif // XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_

View File

@@ -44,11 +44,13 @@ namespace obj {
// List of files that will be force linked in static links.
#ifdef XGBOOST_USE_CUDA
DMLC_REGISTRY_LINK_TAG(regression_obj_gpu);
DMLC_REGISTRY_LINK_TAG(quantile_obj_gpu);
DMLC_REGISTRY_LINK_TAG(hinge_obj_gpu);
DMLC_REGISTRY_LINK_TAG(multiclass_obj_gpu);
DMLC_REGISTRY_LINK_TAG(rank_obj_gpu);
#else
DMLC_REGISTRY_LINK_TAG(regression_obj);
DMLC_REGISTRY_LINK_TAG(quantile_obj);
DMLC_REGISTRY_LINK_TAG(hinge_obj);
DMLC_REGISTRY_LINK_TAG(multiclass_obj);
DMLC_REGISTRY_LINK_TAG(rank_obj);

View File

@@ -0,0 +1,18 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
// Dummy file to enable the CUDA conditional compile trick.
#include <dmlc/registry.h>
namespace xgboost {
namespace obj {
DMLC_REGISTRY_FILE_TAG(quantile_obj);
} // namespace obj
} // namespace xgboost
#ifndef XGBOOST_USE_CUDA
#include "quantile_obj.cu"
#endif // !defined(XBGOOST_USE_CUDA)

View File

@@ -0,0 +1,222 @@
/**
* Copyright 2023 by XGBoost contributors
*/
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <vector> // std::vector
#include "../common/linalg_op.h" // ElementWiseKernel,cbegin,cend
#include "../common/quantile_loss_utils.h" // QuantileLossParam
#include "../common/stats.h" // Quantile,WeightedQuantile
#include "adaptive.h" // UpdateTreeLeaf
#include "dmlc/parameter.h" // DMLC_DECLARE_PARAMETER
#include "init_estimation.h" // CheckInitInputs
#include "xgboost/base.h" // GradientPair,XGBOOST_DEVICE,bst_target_t
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/json.h" // Json,String,ToJson,FromJson
#include "xgboost/linalg.h" // Tensor,MakeTensorView,MakeVec
#include "xgboost/objective.h" // ObjFunction
#include "xgboost/parameter.h" // XGBoostParameter
#if defined(XGBOOST_USE_CUDA)
#include "../common/linalg_op.cuh" // ElementWiseKernel
#include "../common/stats.cuh" // SegmentedQuantile
#endif // defined(XGBOOST_USE_CUDA)
namespace xgboost {
namespace obj {
class QuantileRegression : public ObjFunction {
common::QuantileLossParam param_;
HostDeviceVector<float> alpha_;
bst_target_t Targets(MetaInfo const& info) const override {
auto const& alpha = param_.quantile_alpha.Get();
CHECK_EQ(alpha.size(), alpha_.Size()) << "The objective is not yet configured.";
CHECK_EQ(info.labels.Shape(1), 1) << "Multi-target is not yet supported by the quantile loss.";
CHECK(!alpha.empty());
// We have some placeholders for multi-target in the quantile loss. But it's not
// supported as the gbtree doesn't know how to slice the gradient and there's no 3-dim
// model shape in general.
auto n_y = std::max(static_cast<std::size_t>(1), info.labels.Shape(1));
return alpha_.Size() * n_y;
}
public:
void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info, std::int32_t iter,
HostDeviceVector<GradientPair>* out_gpair) override {
if (iter == 0) {
CheckInitInputs(info);
}
CHECK_EQ(param_.quantile_alpha.Get().size(), alpha_.Size());
using SizeT = decltype(info.num_row_);
SizeT n_targets = this->Targets(info);
SizeT n_alphas = alpha_.Size();
CHECK_NE(n_alphas, 0);
CHECK_GE(n_targets, n_alphas);
CHECK_EQ(preds.Size(), info.num_row_ * n_targets);
auto labels = info.labels.View(ctx_->gpu_id);
out_gpair->SetDevice(ctx_->gpu_id);
out_gpair->Resize(n_targets * info.num_row_);
auto gpair =
linalg::MakeTensorView(ctx_, out_gpair, info.num_row_, n_alphas, n_targets / n_alphas);
info.weights_.SetDevice(ctx_->gpu_id);
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};
preds.SetDevice(ctx_->gpu_id);
auto predt = linalg::MakeVec(&preds);
auto n_samples = info.num_row_;
alpha_.SetDevice(ctx_->gpu_id);
auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
linalg::ElementWiseKernel(
ctx_, gpair, [=] XGBOOST_DEVICE(std::size_t i, GradientPair const&) mutable {
auto [sample_id, quantile_id, target_id] =
linalg::UnravelIndex(i, n_samples, alpha.size(), n_targets / alpha.size());
auto d = predt(i) - labels(sample_id, target_id);
auto h = weight[sample_id];
if (d >= 0) {
auto g = (1.0f - alpha[quantile_id]) * weight[sample_id];
gpair(sample_id, quantile_id, target_id) = GradientPair{g, h};
} else {
auto g = (-alpha[quantile_id] * weight[sample_id]);
gpair(sample_id, quantile_id, target_id) = GradientPair{g, h};
}
});
}
void InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const override {
CHECK(!alpha_.Empty());
auto n_targets = this->Targets(info);
base_score->SetDevice(ctx_->gpu_id);
base_score->Reshape(n_targets);
double sw{0};
if (ctx_->IsCPU()) {
auto quantiles = base_score->HostView();
auto h_weights = info.weights_.ConstHostVector();
if (info.weights_.Empty()) {
sw = info.num_row_;
} else {
sw = std::accumulate(std::cbegin(h_weights), std::cend(h_weights), 0.0);
}
for (bst_target_t t{0}; t < n_targets; ++t) {
auto alpha = param_.quantile_alpha[t];
auto h_labels = info.labels.HostView();
if (h_weights.empty()) {
quantiles(t) =
common::Quantile(ctx_, alpha, linalg::cbegin(h_labels), linalg::cend(h_labels));
} else {
CHECK_EQ(h_weights.size(), h_labels.Size());
quantiles(t) = common::WeightedQuantile(ctx_, alpha, linalg::cbegin(h_labels),
linalg::cend(h_labels), std::cbegin(h_weights));
}
}
} else {
#if defined(XGBOOST_USE_CUDA)
alpha_.SetDevice(ctx_->gpu_id);
auto d_alpha = alpha_.ConstDeviceSpan();
auto d_labels = info.labels.View(ctx_->gpu_id);
auto seg_it = dh::MakeTransformIterator<std::size_t>(
thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); });
CHECK_EQ(d_labels.Shape(1), 1);
auto val_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_labels(sample_idx, 0);
});
auto n = d_labels.Size() * d_alpha.size();
CHECK_EQ(base_score->Size(), d_alpha.size());
if (info.weights_.Empty()) {
common::SegmentedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it,
val_it + n, base_score->Data());
sw = info.num_row_;
} else {
info.weights_.SetDevice(ctx_->gpu_id);
auto d_weights = info.weights_.ConstDeviceSpan();
auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_weights[sample_idx];
});
common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1,
val_it, val_it + n, weight_it, weight_it + n,
base_score->Data());
sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0,
thrust::plus<double>{});
}
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
}
// For multiple quantiles, we should extend the base score to a vector instead of
// computing the average. For now, this is a workaround.
linalg::Vector<float> temp;
common::Mean(ctx_, *base_score, &temp);
double meanq = temp(0) * sw;
collective::Allreduce<collective::Operation::kSum>(&meanq, 1);
collective::Allreduce<collective::Operation::kSum>(&sw, 1);
meanq /= (sw + kRtEps);
base_score->Reshape(1);
base_score->Data()->Fill(meanq);
}
void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
float learning_rate, HostDeviceVector<float> const& prediction,
std::int32_t group_idx, RegTree* p_tree) const override {
auto alpha = param_.quantile_alpha[group_idx];
::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction,
alpha, p_tree);
}
void Configure(Args const& args) override {
param_.UpdateAllowUnknown(args);
param_.Validate();
this->alpha_.HostVector() = param_.quantile_alpha.Get();
}
ObjInfo Task() const override { return {ObjInfo::kRegression, true, true}; }
static char const* Name() { return "reg:quantileerror"; }
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["name"] = String(Name());
out["quantile_loss_param"] = ToJson(param_);
}
void LoadConfig(Json const& in) override {
CHECK_EQ(get<String const>(in["name"]), Name());
FromJson(in["quantile_loss_param"], &param_);
alpha_.HostVector() = param_.quantile_alpha.Get();
}
const char* DefaultEvalMetric() const override { return "quantile"; }
Json DefaultMetricConfig() const override {
CHECK(param_.GetInitialised());
Json config{Object{}};
config["name"] = String{this->DefaultEvalMetric()};
config["quantile_loss_param"] = ToJson(param_);
return config;
}
};
XGBOOST_REGISTER_OBJECTIVE(QuantileRegression, QuantileRegression::Name())
.describe("Regression with quantile loss.")
.set_body([]() { return new QuantileRegression(); });
#if defined(XGBOOST_USE_CUDA)
DMLC_REGISTRY_FILE_TAG(quantile_obj_gpu);
#endif // defined(XGBOOST_USE_CUDA)
} // namespace obj
} // namespace xgboost

View File

@@ -1,15 +1,16 @@
/*!
* Copyright 2017-2022 XGBoost contributors
/**
* Copyright 2017-2023 by XGBoost contributors
*/
#ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_
#define XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_
#include <dmlc/omp.h>
#include <xgboost/logging.h>
#include <cmath>
#include "../common/math.h"
#include "xgboost/data.h" // MetaInfo
#include "xgboost/logging.h"
#include "xgboost/task.h" // ObjInfo
namespace xgboost {
@@ -105,7 +106,6 @@ struct LogisticRaw : public LogisticRegression {
static ObjInfo Info() { return ObjInfo::kRegression; }
};
} // namespace obj
} // namespace xgboost

View File

@@ -20,12 +20,12 @@
#include "../common/stats.h"
#include "../common/threading_utils.h"
#include "../common/transform.h"
#include "../tree/fit_stump.h" // FitStump
#include "./regression_loss.h"
#include "adaptive.h"
#include "init_estimation.h" // FitIntercept
#include "xgboost/base.h"
#include "xgboost/context.h"
#include "xgboost/data.h" // MetaInfo
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h"
#include "xgboost/json.h"
#include "xgboost/linalg.h"
@@ -43,45 +43,12 @@
namespace xgboost {
namespace obj {
namespace {
void CheckInitInputs(MetaInfo const& info) {
CHECK_EQ(info.labels.Shape(0), info.num_row_) << "Invalid shape of labels.";
if (!info.weights_.Empty()) {
CHECK_EQ(info.weights_.Size(), info.num_row_)
<< "Number of weights should be equal to number of data points.";
}
}
void CheckRegInputs(MetaInfo const& info, HostDeviceVector<bst_float> const& preds) {
CheckInitInputs(info);
CHECK_EQ(info.labels.Size(), preds.Size()) << "Invalid shape of labels.";
}
} // anonymous namespace
class RegInitEstimation : public ObjFunction {
void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) const override {
CheckInitInputs(info);
// Avoid altering any state in child objective.
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
HostDeviceVector<GradientPair> gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id);
Json config{Object{}};
this->SaveConfig(&config);
std::unique_ptr<ObjFunction> new_obj{
ObjFunction::Create(get<String const>(config["name"]), this->ctx_)};
new_obj->LoadConfig(config);
new_obj->GetGradient(dummy_predt, info, 0, &gpair);
bst_target_t n_targets = this->Targets(info);
linalg::Vector<float> leaf_weight;
tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);
// workaround, we don't support multi-target due to binary model serialization for
// base margin.
common::Mean(this->ctx_, leaf_weight, base_score);
this->PredTransform(base_score->Data());
}
};
#if defined(XGBOOST_USE_CUDA)
DMLC_REGISTRY_FILE_TAG(regression_obj_gpu);
#endif // defined(XGBOOST_USE_CUDA)
@@ -96,7 +63,7 @@ struct RegLossParam : public XGBoostParameter<RegLossParam> {
};
template<typename Loss>
class RegLossObj : public RegInitEstimation {
class RegLossObj : public FitIntercept {
protected:
HostDeviceVector<float> additional_input_;
@@ -243,7 +210,7 @@ XGBOOST_REGISTER_OBJECTIVE(LinearRegression, "reg:linear")
return new RegLossObj<LinearSquareLoss>(); });
// End deprecated
class PseudoHuberRegression : public RegInitEstimation {
class PseudoHuberRegression : public FitIntercept {
PesudoHuberParam param_;
public:
@@ -318,7 +285,7 @@ struct PoissonRegressionParam : public XGBoostParameter<PoissonRegressionParam>
};
// poisson regression for count
class PoissonRegression : public RegInitEstimation {
class PoissonRegression : public FitIntercept {
public:
// declare functions
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
@@ -413,7 +380,7 @@ XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson")
// cox regression for survival data (negative values mean they are censored)
class CoxRegression : public RegInitEstimation {
class CoxRegression : public FitIntercept {
public:
void Configure(Args const&) override {}
ObjInfo Task() const override { return ObjInfo::kRegression; }
@@ -426,7 +393,7 @@ class CoxRegression : public RegInitEstimation {
const auto& preds_h = preds.HostVector();
out_gpair->Resize(preds_h.size());
auto& gpair = out_gpair->HostVector();
const std::vector<size_t> &label_order = info.LabelAbsSort();
const std::vector<size_t> &label_order = info.LabelAbsSort(ctx_);
const omp_ulong ndata = static_cast<omp_ulong>(preds_h.size()); // NOLINT(*)
const bool is_null_weight = info.weights_.Size() == 0;
@@ -510,7 +477,7 @@ XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox")
.set_body([]() { return new CoxRegression(); });
// gamma regression
class GammaRegression : public RegInitEstimation {
class GammaRegression : public FitIntercept {
public:
void Configure(Args const&) override {}
ObjInfo Task() const override { return ObjInfo::kRegression; }
@@ -601,7 +568,7 @@ struct TweedieRegressionParam : public XGBoostParameter<TweedieRegressionParam>
};
// tweedie regression
class TweedieRegression : public RegInitEstimation {
class TweedieRegression : public FitIntercept {
public:
// declare functions
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
@@ -775,20 +742,10 @@ class MeanAbsoluteError : public ObjFunction {
}
void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
HostDeviceVector<float> const& prediction, std::int32_t group_idx,
RegTree* p_tree) const override {
if (ctx_->IsCPU()) {
auto const& h_position = position.ConstHostVector();
detail::UpdateTreeLeafHost(ctx_, h_position, group_idx, info, prediction, 0.5, p_tree);
} else {
#if defined(XGBOOST_USE_CUDA)
position.SetDevice(ctx_->gpu_id);
auto d_position = position.ConstDeviceSpan();
detail::UpdateTreeLeafDevice(ctx_, d_position, group_idx, info, prediction, 0.5, p_tree);
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
}
float learning_rate, HostDeviceVector<float> const& prediction,
std::int32_t group_idx, RegTree* p_tree) const override {
::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction, 0.5,
p_tree);
}
const char* DefaultEvalMetric() const override { return "mae"; }

View File

@@ -164,7 +164,7 @@ struct GHistIndexMatrixView {
SparsePage::Inst operator[](size_t r) {
auto t = omp_get_thread_num();
auto const beg = (n_features_ * kUnroll * t) + (current_unroll_[t] * n_features_);
size_t non_missing{beg};
size_t non_missing{static_cast<std::size_t>(beg)};
for (bst_feature_t c = 0; c < n_features_; ++c) {
float f = page_.GetFvalue(r, c, common::IsCat(ft_, c));
@@ -477,7 +477,8 @@ class ColumnSplitHelper {
// auto block_id has the same type as `n_blocks`.
common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) {
auto const batch_offset = block_id * block_of_rows_size;
auto const block_size = std::min(nsize - batch_offset, block_of_rows_size);
auto const block_size = std::min(static_cast<std::size_t>(nsize - batch_offset),
static_cast<std::size_t>(block_of_rows_size));
auto const fvec_offset = omp_get_thread_num() * block_of_rows_size;
FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, &feat_vecs_);
@@ -490,7 +491,8 @@ class ColumnSplitHelper {
// auto block_id has the same type as `n_blocks`.
common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) {
auto const batch_offset = block_id * block_of_rows_size;
auto const block_size = std::min(nsize - batch_offset, block_of_rows_size);
auto const block_size = std::min(static_cast<std::size_t>(nsize - batch_offset),
static_cast<std::size_t>(block_of_rows_size));
PredictAllTrees(out_preds, batch_offset, batch_offset + batch.base_rowid, num_group,
block_size);
});
@@ -584,7 +586,7 @@ class CPUPredictor : public Predictor {
void PredictDMatrix(DMatrix *p_fmat, std::vector<bst_float> *out_preds,
gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end) const {
if (p_fmat->Info().data_split_mode == DataSplitMode::kCol) {
if (p_fmat->IsColumnSplit()) {
ColumnSplitHelper helper(this->ctx_->Threads(), model, tree_begin, tree_end);
helper.PredictDMatrix(p_fmat, out_preds);
return;

View File

@@ -3,10 +3,11 @@
*/
#include "cpu_treeshap.h"
#include <cinttypes> // std::uint32_t
#include <algorithm> // copy
#include <cinttypes> // std::uint32_t
#include "predict_fn.h" // GetNextNode
#include "xgboost/base.h" // bst_node_t
#include "predict_fn.h" // GetNextNode
#include "xgboost/base.h" // bst_node_t
#include "xgboost/logging.h"
#include "xgboost/tree_model.h" // RegTree

View File

@@ -1,6 +1,10 @@
#ifndef XGBOOST_PREDICTOR_CPU_TREESHAP_H_
#define XGBOOST_PREDICTOR_CPU_TREESHAP_H_
/**
* Copyright by XGBoost Contributors 2017-2022
*/
#include <vector> // vector
#include "xgboost/tree_model.h" // RegTree
namespace xgboost {
@@ -15,3 +19,4 @@ void CalculateContributions(RegTree const &tree, const RegTree::FVec &feat,
std::vector<float> *mean_values, bst_float *out_contribs, int condition,
unsigned condition_feature);
} // namespace xgboost
#endif // XGBOOST_PREDICTOR_CPU_TREESHAP_H_

View File

@@ -9,6 +9,7 @@
#include <limits> // std::numeric_limits
#include <vector>
#include "../collective/communicator-inl.h"
#include "../common/numeric.h" // Iota
#include "../common/partition_builder.h"
#include "hist/expand_entry.h" // CPUExpandEntry
@@ -16,17 +17,73 @@
namespace xgboost {
namespace tree {
class CommonRowPartitioner {
static constexpr size_t kPartitionBlockSize = 2048;
common::PartitionBuilder<kPartitionBlockSize> partition_builder_;
common::RowSetCollection row_set_collection_;
static constexpr size_t kPartitionBlockSize = 2048;
class ColumnSplitHelper {
public:
ColumnSplitHelper() = default;
ColumnSplitHelper(bst_row_t num_row,
common::PartitionBuilder<kPartitionBlockSize>* partition_builder,
common::RowSetCollection* row_set_collection)
: partition_builder_{partition_builder}, row_set_collection_{row_set_collection} {
decision_storage_.resize(num_row);
decision_bits_ = BitVector(common::Span<BitVector::value_type>(decision_storage_));
missing_storage_.resize(num_row);
missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
}
void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
// When data is split by column, we don't have all the feature values in the local worker, so
// we first collect all the decisions and whether the feature is missing into bit vectors.
std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
const int32_t nid = nodes[node_in_set].nid;
partition_builder_->MaskRows(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
(*row_set_collection_)[nid].begin, &decision_bits_,
&missing_bits_);
});
// Then aggregate the bit vectors across all the workers.
collective::Allreduce<collective::Operation::kBitwiseOR>(decision_storage_.data(),
decision_storage_.size());
collective::Allreduce<collective::Operation::kBitwiseAND>(missing_storage_.data(),
missing_storage_.size());
// Finally use the bit vectors to partition the rows.
common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
size_t begin = r.begin();
const int32_t nid = nodes[node_in_set].nid;
const size_t task_id = partition_builder_->GetTaskIdx(node_in_set, begin);
partition_builder_->AllocateForTask(task_id);
partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
(*row_set_collection_)[nid].begin, decision_bits_,
missing_bits_);
});
}
private:
using BitVector = RBitField8;
std::vector<BitVector::value_type> decision_storage_{};
BitVector decision_bits_{};
std::vector<BitVector::value_type> missing_storage_{};
BitVector missing_bits_{};
common::PartitionBuilder<kPartitionBlockSize>* partition_builder_;
common::RowSetCollection* row_set_collection_;
};
class CommonRowPartitioner {
public:
bst_row_t base_rowid = 0;
CommonRowPartitioner() = default;
CommonRowPartitioner(Context const* ctx, bst_row_t num_row, bst_row_t _base_rowid)
: base_rowid{_base_rowid} {
CommonRowPartitioner(Context const* ctx, bst_row_t num_row, bst_row_t _base_rowid,
bool is_col_split)
: base_rowid{_base_rowid}, is_col_split_{is_col_split} {
row_set_collection_.Clear();
std::vector<size_t>& row_indices = *row_set_collection_.Data();
row_indices.resize(num_row);
@@ -34,6 +91,10 @@ class CommonRowPartitioner {
std::size_t* p_row_indices = row_indices.data();
common::Iota(ctx, p_row_indices, p_row_indices + row_indices.size(), base_rowid);
row_set_collection_.Init();
if (is_col_split_) {
column_split_helper_ = ColumnSplitHelper{num_row, &partition_builder_, &row_set_collection_};
}
}
void FindSplitConditions(const std::vector<CPUExpandEntry>& nodes, const RegTree& tree,
@@ -156,16 +217,20 @@ class CommonRowPartitioner {
// 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node
// Store results in intermediate buffers from partition_builder_
common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
size_t begin = r.begin();
const int32_t nid = nodes[node_in_set].nid;
const size_t task_id = partition_builder_.GetTaskIdx(node_in_set, begin);
partition_builder_.AllocateForTask(task_id);
bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
partition_builder_.template Partition<BinIdxType, any_missing, any_cat>(
node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
row_set_collection_[nid].begin);
});
if (is_col_split_) {
column_split_helper_.Partition(space, ctx->Threads(), gmat, column_matrix, nodes, p_tree);
} else {
common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
size_t begin = r.begin();
const int32_t nid = nodes[node_in_set].nid;
const size_t task_id = partition_builder_.GetTaskIdx(node_in_set, begin);
partition_builder_.AllocateForTask(task_id);
bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
partition_builder_.template Partition<BinIdxType, any_missing, any_cat>(
node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
row_set_collection_[nid].begin);
});
}
// 3. Compute offsets to copy blocks of row-indexes
// from partition_builder_ to row_set_collection_
@@ -205,6 +270,12 @@ class CommonRowPartitioner {
ctx, tree, this->Partitions(), p_out_position,
[&](size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; });
}
private:
common::PartitionBuilder<kPartitionBlockSize> partition_builder_;
common::RowSetCollection row_set_collection_;
bool is_col_split_;
ColumnSplitHelper column_split_helper_;
};
} // namespace tree

View File

@@ -97,7 +97,7 @@ class EvaluateSplitAgent {
idx += kBlockSize) {
local_sum += LoadGpair(node_histogram + idx);
}
local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum);
local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum); // NOLINT
// Broadcast result from thread 0
return {__shfl_sync(0xffffffff, local_sum.GetQuantisedGrad(), 0),
__shfl_sync(0xffffffff, local_sum.GetQuantisedHess(), 0)};
@@ -359,8 +359,8 @@ void GPUHistEvaluator::LaunchEvaluateSplits(
// One block for each feature
uint32_t constexpr kBlockThreads = 32;
dh::LaunchKernel{static_cast<uint32_t>(combined_num_features), kBlockThreads,
0}(
dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads,
0}(
EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs,
shared_inputs,
this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),

View File

@@ -1,15 +1,15 @@
/*!
* Copyright 2020-2021 by XGBoost Contributors
/**
* Copyright 2020-2023 by XGBoost Contributors
*/
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <algorithm>
#include <ctgmath>
#include <cstdint> // uint32_t
#include <limits>
#include "../../common/device_helpers.cuh"
#include "../../common/deterministic.cuh"
#include "../../common/device_helpers.cuh"
#include "../../data/ellpack_page.cuh"
#include "histogram.cuh"
#include "row_partitioner.cuh"
@@ -83,7 +83,8 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
*/
to_floating_point_ =
histogram_rounding /
T(IntT(1) << (sizeof(typename GradientSumT::ValueT) * 8 - 2)); // keep 1 for sign bit
static_cast<T>(static_cast<IntT>(1)
<< (sizeof(typename GradientSumT::ValueT) * 8 - 2)); // keep 1 for sign bit
/**
* Factor for converting gradients from floating-point to fixed-point. For
* f64:
@@ -93,8 +94,8 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
* rounding is calcuated as exp(m), see the rounding factor calcuation for
* details.
*/
to_fixed_point_ =
GradientSumT(T(1) / to_floating_point_.GetGrad(), T(1) / to_floating_point_.GetHess());
to_fixed_point_ = GradientSumT(static_cast<T>(1) / to_floating_point_.GetGrad(),
static_cast<T>(1) / to_floating_point_.GetHess());
}
@@ -153,7 +154,8 @@ class HistogramAgent {
d_gpair_(d_gpair) {}
__device__ void ProcessPartialTileShared(std::size_t offset) {
for (std::size_t idx = offset + threadIdx.x;
idx < min(offset + kBlockThreads * kItemsPerTile, n_elements_); idx += kBlockThreads) {
idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_);
idx += kBlockThreads) {
int ridx = d_ridx_[idx / feature_stride_];
int gidx =
matrix_
@@ -295,11 +297,10 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
// Allocate number of blocks such that each block has about kMinItemsPerBlock work
// Up to a maximum where the device is saturated
grid_size =
min(grid_size,
unsigned(common::DivRoundUp(items_per_group, kMinItemsPerBlock)));
grid_size = std::min(grid_size, static_cast<std::uint32_t>(
common::DivRoundUp(items_per_group, kMinItemsPerBlock)));
dh::LaunchKernel{dim3(grid_size, num_groups), static_cast<uint32_t>(kBlockThreads), smem_size,
dh::LaunchKernel {dim3(grid_size, num_groups), static_cast<uint32_t>(kBlockThreads), smem_size,
ctx->Stream()} (kernel, matrix, feature_groups, d_ridx, histogram.data(),
gpair.data(), rounding);
};

View File

@@ -130,7 +130,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
std::size_t item_idx;
AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx);
auto op_res = op(ridx[item_idx], batch_info_itr[batch_idx].data);
return IndexFlagTuple{bst_uint(item_idx), op_res, batch_idx, op_res};
return IndexFlagTuple{static_cast<bst_uint>(item_idx), op_res, batch_idx, op_res};
});
size_t temp_bytes = 0;
if (tmp->empty()) {

View File

@@ -1,10 +1,11 @@
/*!
* Copyright 2021-2022 by XGBoost Contributors
/**
* Copyright 2021-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#include <algorithm>
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <numeric>
@@ -16,13 +17,11 @@
#include "../../common/random.h"
#include "../../data/gradient_index.h"
#include "../constraints.h"
#include "../param.h"
#include "../param.h" // for TrainParam
#include "../split_evaluator.h"
#include "xgboost/context.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
template <typename ExpandEntry>
class HistEvaluator {
private:
@@ -34,10 +33,11 @@ class HistEvaluator {
};
private:
TrainParam param_;
Context const* ctx_;
TrainParam const* param_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
TreeEvaluator tree_evaluator_;
int32_t n_threads_ {0};
bool is_col_split_{false};
FeatureInteractionConstraintHost interaction_constraints_;
std::vector<NodeEntry> snode_;
@@ -53,8 +53,9 @@ class HistEvaluator {
}
}
bool IsValid(GradStats const &left, GradStats const &right) const {
return left.GetHess() >= param_.min_child_weight && right.GetHess() >= param_.min_child_weight;
[[nodiscard]] bool IsValid(GradStats const &left, GradStats const &right) const {
return left.GetHess() >= param_->min_child_weight &&
right.GetHess() >= param_->min_child_weight;
}
/**
@@ -93,9 +94,10 @@ class HistEvaluator {
right_sum = GradStats{hist[i]};
left_sum.SetSubstract(parent.stats, right_sum);
if (IsValid(left_sum, right_sum)) {
auto missing_left_chg = static_cast<float>(
evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
parent.root_gain);
auto missing_left_chg =
static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
GradStats{right_sum}) -
parent.root_gain);
best.Update(missing_left_chg, fidx, split_pt, true, true, left_sum, right_sum);
}
@@ -103,9 +105,10 @@ class HistEvaluator {
right_sum.Add(missing);
left_sum.SetSubstract(parent.stats, right_sum);
if (IsValid(left_sum, right_sum)) {
auto missing_right_chg = static_cast<float>(
evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
parent.root_gain);
auto missing_right_chg =
static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
GradStats{right_sum}) -
parent.root_gain);
best.Update(missing_right_chg, fidx, split_pt, false, true, left_sum, right_sum);
}
}
@@ -150,7 +153,7 @@ class HistEvaluator {
bst_bin_t f_begin = cut_ptr[fidx];
bst_bin_t f_end = cut_ptr[fidx + 1];
bst_bin_t n_bins_feature{f_end - f_begin};
auto n_bins = std::min(param_.max_cat_threshold, n_bins_feature);
auto n_bins = std::min(param_->max_cat_threshold, n_bins_feature);
// statistics on both sides of split
GradStats left_sum;
@@ -179,9 +182,9 @@ class HistEvaluator {
right_sum.SetSubstract(parent.stats, left_sum); // missing on right
}
if (IsValid(left_sum, right_sum)) {
auto loss_chg =
evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
parent.root_gain;
auto loss_chg = evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
GradStats{right_sum}) -
parent.root_gain;
// We don't have a numeric split point, nan here is a dummy split.
if (best.Update(loss_chg, fidx, std::numeric_limits<float>::quiet_NaN(), d_step == 1, true,
left_sum, right_sum)) {
@@ -254,7 +257,7 @@ class HistEvaluator {
if (d_step > 0) {
// forward enumeration: split at right bound of each bin
loss_chg =
static_cast<float>(evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum},
static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
GradStats{right_sum}) -
parent.root_gain);
split_pt = cut_val[i]; // not used for partition based
@@ -262,7 +265,7 @@ class HistEvaluator {
} else {
// backward enumeration: split at left bound of each bin
loss_chg =
static_cast<float>(evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{right_sum},
static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{right_sum},
GradStats{left_sum}) -
parent.root_gain);
if (i == imin) {
@@ -283,6 +286,7 @@ class HistEvaluator {
void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
common::Span<FeatureType const> feature_types, const RegTree &tree,
std::vector<ExpandEntry> *p_entries) {
auto n_threads = ctx_->Threads();
auto& entries = *p_entries;
// All nodes are on the same level, so we can store the shared ptr.
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(
@@ -294,23 +298,23 @@ class HistEvaluator {
}
CHECK(!features.empty());
const size_t grain_size =
std::max<size_t>(1, features.front()->Size() / n_threads_);
std::max<size_t>(1, features.front()->Size() / n_threads);
common::BlockedSpace2d space(entries.size(), [&](size_t nidx_in_set) {
return features[nidx_in_set]->Size();
}, grain_size);
std::vector<ExpandEntry> tloc_candidates(n_threads_ * entries.size());
std::vector<ExpandEntry> tloc_candidates(n_threads * entries.size());
for (size_t i = 0; i < entries.size(); ++i) {
for (decltype(n_threads_) j = 0; j < n_threads_; ++j) {
tloc_candidates[i * n_threads_ + j] = entries[i];
for (decltype(n_threads) j = 0; j < n_threads; ++j) {
tloc_candidates[i * n_threads + j] = entries[i];
}
}
auto evaluator = tree_evaluator_.GetEvaluator();
auto const& cut_ptrs = cut.Ptrs();
common::ParallelFor2d(space, n_threads_, [&](size_t nidx_in_set, common::Range1d r) {
common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) {
auto tidx = omp_get_thread_num();
auto entry = &tloc_candidates[n_threads_ * nidx_in_set + tidx];
auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
auto best = &entry->split;
auto nidx = entry->nid;
auto histogram = hist[nidx];
@@ -323,7 +327,7 @@ class HistEvaluator {
}
if (is_cat) {
auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx];
if (common::UseOneHot(n_bins, param_.max_cat_to_onehot)) {
if (common::UseOneHot(n_bins, param_->max_cat_to_onehot)) {
EnumerateOneHot(cut, histogram, fidx, nidx, evaluator, best);
} else {
std::vector<size_t> sorted_idx(n_bins);
@@ -331,8 +335,8 @@ class HistEvaluator {
auto feat_hist = histogram.subspan(cut_ptrs[fidx], n_bins);
// Sort the histogram to get contiguous partitions.
std::stable_sort(sorted_idx.begin(), sorted_idx.end(), [&](size_t l, size_t r) {
auto ret = evaluator.CalcWeightCat(param_, feat_hist[l]) <
evaluator.CalcWeightCat(param_, feat_hist[r]);
auto ret = evaluator.CalcWeightCat(*param_, feat_hist[l]) <
evaluator.CalcWeightCat(*param_, feat_hist[r]);
return ret;
});
EnumeratePart<+1>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best);
@@ -349,12 +353,29 @@ class HistEvaluator {
for (unsigned nidx_in_set = 0; nidx_in_set < entries.size();
++nidx_in_set) {
for (auto tidx = 0; tidx < n_threads_; ++tidx) {
for (auto tidx = 0; tidx < n_threads; ++tidx) {
entries[nidx_in_set].split.Update(
tloc_candidates[n_threads_ * nidx_in_set + tidx].split);
tloc_candidates[n_threads * nidx_in_set + tidx].split);
}
}
if (is_col_split_) {
// With column-wise data split, we gather the best splits from all the workers and update the
// expand entries accordingly.
auto const world = collective::GetWorldSize();
auto const rank = collective::GetRank();
auto const num_entries = entries.size();
std::vector<ExpandEntry> buffer{num_entries * world};
std::copy_n(entries.cbegin(), num_entries, buffer.begin() + num_entries * rank);
collective::Allgather(buffer.data(), buffer.size() * sizeof(ExpandEntry));
for (auto worker = 0; worker < world; ++worker) {
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
entries[nidx_in_set].split.Update(buffer[worker * num_entries + nidx_in_set].split);
}
}
}
}
// Add splits to tree, handles all statistic
void ApplyTreeSplit(ExpandEntry const& candidate, RegTree *p_tree) {
auto evaluator = tree_evaluator_.GetEvaluator();
@@ -362,24 +383,22 @@ class HistEvaluator {
GradStats parent_sum = candidate.split.left_sum;
parent_sum.Add(candidate.split.right_sum);
auto base_weight =
evaluator.CalcWeight(candidate.nid, param_, GradStats{parent_sum});
auto base_weight = evaluator.CalcWeight(candidate.nid, *param_, GradStats{parent_sum});
auto left_weight =
evaluator.CalcWeight(candidate.nid, param_, GradStats{candidate.split.left_sum});
evaluator.CalcWeight(candidate.nid, *param_, GradStats{candidate.split.left_sum});
auto right_weight =
evaluator.CalcWeight(candidate.nid, param_, GradStats{candidate.split.right_sum});
evaluator.CalcWeight(candidate.nid, *param_, GradStats{candidate.split.right_sum});
if (candidate.split.is_cat) {
tree.ExpandCategorical(
candidate.nid, candidate.split.SplitIndex(), candidate.split.cat_bits,
candidate.split.DefaultLeft(), base_weight, left_weight * param_.learning_rate,
right_weight * param_.learning_rate, candidate.split.loss_chg, parent_sum.GetHess(),
candidate.split.DefaultLeft(), base_weight, left_weight * param_->learning_rate,
right_weight * param_->learning_rate, candidate.split.loss_chg, parent_sum.GetHess(),
candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess());
} else {
tree.ExpandNode(candidate.nid, candidate.split.SplitIndex(), candidate.split.split_value,
candidate.split.DefaultLeft(), base_weight,
left_weight * param_.learning_rate, right_weight * param_.learning_rate,
left_weight * param_->learning_rate, right_weight * param_->learning_rate,
candidate.split.loss_chg, parent_sum.GetHess(),
candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess());
}
@@ -395,11 +414,11 @@ class HistEvaluator {
max_node = std::max(candidate.nid, max_node);
snode_.resize(tree.GetNodes().size());
snode_.at(left_child).stats = candidate.split.left_sum;
snode_.at(left_child).root_gain = evaluator.CalcGain(
candidate.nid, param_, GradStats{candidate.split.left_sum});
snode_.at(left_child).root_gain =
evaluator.CalcGain(candidate.nid, *param_, GradStats{candidate.split.left_sum});
snode_.at(right_child).stats = candidate.split.right_sum;
snode_.at(right_child).root_gain = evaluator.CalcGain(
candidate.nid, param_, GradStats{candidate.split.right_sum});
snode_.at(right_child).root_gain =
evaluator.CalcGain(candidate.nid, *param_, GradStats{candidate.split.right_sum});
interaction_constraints_.Split(candidate.nid,
tree[candidate.nid].SplitIndex(), left_child,
@@ -409,30 +428,31 @@ class HistEvaluator {
auto Evaluator() const { return tree_evaluator_.GetEvaluator(); }
auto const& Stats() const { return snode_; }
float InitRoot(GradStats const& root_sum) {
float InitRoot(GradStats const &root_sum) {
snode_.resize(1);
auto root_evaluator = tree_evaluator_.GetEvaluator();
snode_[0].stats = GradStats{root_sum.GetGrad(), root_sum.GetHess()};
snode_[0].root_gain = root_evaluator.CalcGain(RegTree::kRoot, param_,
GradStats{snode_[0].stats});
auto weight = root_evaluator.CalcWeight(RegTree::kRoot, param_,
GradStats{snode_[0].stats});
snode_[0].root_gain =
root_evaluator.CalcGain(RegTree::kRoot, *param_, GradStats{snode_[0].stats});
auto weight = root_evaluator.CalcWeight(RegTree::kRoot, *param_, GradStats{snode_[0].stats});
return weight;
}
public:
// The column sampler must be constructed by caller since we need to preserve the rng
// for the entire training session.
explicit HistEvaluator(TrainParam const &param, MetaInfo const &info, int32_t n_threads,
explicit HistEvaluator(Context const *ctx, TrainParam const *param, MetaInfo const &info,
std::shared_ptr<common::ColumnSampler> sampler)
: param_{param},
: ctx_{ctx},
param_{param},
column_sampler_{std::move(sampler)},
tree_evaluator_{param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
n_threads_{n_threads} {
interaction_constraints_.Configure(param, info.num_col_);
column_sampler_->Init(info.num_col_, info.feature_weights.HostVector(), param_.colsample_bynode,
param_.colsample_bylevel, param_.colsample_bytree);
tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
is_col_split_{info.data_split_mode == DataSplitMode::kCol} {
interaction_constraints_.Configure(*param, info.num_col_);
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
param_->colsample_bynode, param_->colsample_bylevel,
param_->colsample_bytree);
}
};
@@ -467,6 +487,5 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree,
});
}
}
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree
#endif // XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_

View File

@@ -29,6 +29,7 @@ class HistogramBuilder {
size_t n_batches_{0};
// Whether XGBoost is running in distributed environment.
bool is_distributed_{false};
bool is_col_split_{false};
public:
/**
@@ -40,7 +41,7 @@ class HistogramBuilder {
* of using global rabit variable.
*/
void Reset(uint32_t total_bins, BatchParam p, int32_t n_threads, size_t n_batches,
bool is_distributed) {
bool is_distributed, bool is_col_split) {
CHECK_GE(n_threads, 1);
n_threads_ = n_threads;
n_batches_ = n_batches;
@@ -50,6 +51,7 @@ class HistogramBuilder {
buffer_.Init(total_bins);
builder_ = common::GHistBuilder(total_bins);
is_distributed_ = is_distributed;
is_col_split_ = is_col_split;
// Workaround s390x gcc 7.5.0
auto DMLC_ATTRIBUTE_UNUSED __force_instantiation = &GradientPairPrecise::Reduce;
}
@@ -96,7 +98,7 @@ class HistogramBuilder {
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
RegTree const *p_tree) {
if (is_distributed_) {
if (is_distributed_ && !is_col_split_) {
this->AddHistRowsDistributed(starting_index, sync_count, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick, p_tree);
} else {
@@ -130,7 +132,7 @@ class HistogramBuilder {
return;
}
if (is_distributed_) {
if (is_distributed_ && !is_col_split_) {
this->SyncHistogramDistributed(p_tree, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick,
starting_index, sync_count);

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2021 by Contributors
/**
* Copyright 2014-2023 by XGBoost Contributors
* \file param.h
* \brief training parameters, statistics used to support tree construction.
* \author Tianqi Chen
@@ -238,9 +238,8 @@ XGBOOST_DEVICE inline static T1 ThresholdL1(T1 w, T2 alpha) {
// calculate the cost of loss function
template <typename TrainingParams, typename T>
XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p,
T sum_grad, T sum_hess, T w) {
return -(T(2.0) * sum_grad * w + (sum_hess + p.reg_lambda) * common::Sqr(w));
XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad, T sum_hess, T w) {
return -(static_cast<T>(2.0) * sum_grad * w + (sum_hess + p.reg_lambda) * common::Sqr(w));
}
// calculate weight given the statistics
@@ -261,7 +260,7 @@ XGBOOST_DEVICE inline T CalcWeight(const TrainingParams &p, T sum_grad,
template <typename TrainingParams, typename T>
XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) {
if (sum_hess < p.min_child_weight || sum_hess <= 0.0) {
return T(0.0);
return static_cast<T>(0.0);
}
if (p.max_delta_step == 0.0f) {
if (p.reg_alpha == 0.0f) {

View File

@@ -1069,8 +1069,8 @@ bool LoadModelImpl(Json const& in, TreeParam* param, std::vector<RTreeNodeStat>*
split_types = std::remove_reference_t<decltype(split_types)>(n_nodes);
split_categories_segments = std::remove_reference_t<decltype(split_categories_segments)>(n_nodes);
static_assert(std::is_integral<decltype(GetElem<Integer>(lefts, 0))>::value, "");
static_assert(std::is_floating_point<decltype(GetElem<Number>(loss_changes, 0))>::value, "");
static_assert(std::is_integral<decltype(GetElem<Integer>(lefts, 0))>::value);
static_assert(std::is_floating_point<decltype(GetElem<Number>(loss_changes, 0))>::value);
CHECK_EQ(n_nodes, split_categories_segments.size());
// Set node

View File

@@ -23,8 +23,7 @@
#include "xgboost/tree_model.h"
#include "xgboost/tree_updater.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_approx);
@@ -41,7 +40,7 @@ auto BatchSpec(TrainParam const &p, common::Span<float> hess) {
class GloablApproxBuilder {
protected:
TrainParam param_;
TrainParam const* param_;
std::shared_ptr<common::ColumnSampler> col_sampler_;
HistEvaluator<CPUExpandEntry> evaluator_;
HistogramBuilder<CPUExpandEntry> histogram_builder_;
@@ -64,19 +63,19 @@ class GloablApproxBuilder {
bst_bin_t n_total_bins = 0;
partitioner_.clear();
// Generating the GHistIndexMatrix is quite slow, is there a way to speed it up?
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess, task_))) {
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess, task_))) {
if (n_total_bins == 0) {
n_total_bins = page.cut.TotalBins();
feature_values_ = page.cut;
} else {
CHECK_EQ(n_total_bins, page.cut.TotalBins());
}
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid);
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid, p_fmat->IsColumnSplit());
n_batches_++;
}
histogram_builder_.Reset(n_total_bins, BatchSpec(param_, hess), ctx_->Threads(), n_batches_,
collective::IsDistributed());
histogram_builder_.Reset(n_total_bins, BatchSpec(*param_, hess), ctx_->Threads(), n_batches_,
collective::IsDistributed(), p_fmat->IsColumnSplit());
monitor_->Stop(__func__);
}
@@ -90,11 +89,13 @@ class GloablApproxBuilder {
for (auto const &g : gpair) {
root_sum.Add(g);
}
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&root_sum), 2);
if (p_fmat->IsRowSplit()) {
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&root_sum), 2);
}
std::vector<CPUExpandEntry> nodes{best};
size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess))) {
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), nodes,
{}, gpair);
i++;
@@ -103,7 +104,7 @@ class GloablApproxBuilder {
auto weight = evaluator_.InitRoot(root_sum);
p_tree->Stat(RegTree::kRoot).sum_hess = root_sum.GetHess();
p_tree->Stat(RegTree::kRoot).base_weight = weight;
(*p_tree)[RegTree::kRoot].SetLeaf(param_.learning_rate * weight);
(*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
auto const &histograms = histogram_builder_.Histogram();
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
@@ -145,7 +146,7 @@ class GloablApproxBuilder {
size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess))) {
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes_to_build, nodes_to_sub, gpair);
i++;
@@ -166,12 +167,12 @@ class GloablApproxBuilder {
}
public:
explicit GloablApproxBuilder(TrainParam param, MetaInfo const &info, Context const *ctx,
explicit GloablApproxBuilder(TrainParam const *param, MetaInfo const &info, Context const *ctx,
std::shared_ptr<common::ColumnSampler> column_sampler, ObjInfo task,
common::Monitor *monitor)
: param_{std::move(param)},
: param_{param},
col_sampler_{std::move(column_sampler)},
evaluator_{param_, info, ctx->Threads(), col_sampler_},
evaluator_{ctx, param_, info, col_sampler_},
ctx_{ctx},
task_{task},
monitor_{monitor} {}
@@ -181,7 +182,7 @@ class GloablApproxBuilder {
p_last_tree_ = p_tree;
this->InitData(p_fmat, hess);
Driver<CPUExpandEntry> driver(param_);
Driver<CPUExpandEntry> driver(*param_);
auto &tree = *p_tree;
driver.Push({this->InitRoot(p_fmat, gpair, hess, p_tree)});
auto expand_set = driver.Pop();
@@ -211,7 +212,7 @@ class GloablApproxBuilder {
monitor_->Start("UpdatePosition");
size_t page_id = 0;
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess))) {
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
partitioner_.at(page_id).UpdatePosition(ctx_, page, applied, p_tree);
page_id++;
}
@@ -248,7 +249,6 @@ class GloablApproxBuilder {
* iteration.
*/
class GlobalApproxUpdater : public TreeUpdater {
TrainParam param_;
common::Monitor monitor_;
// specializations for different histogram precision.
std::unique_ptr<GloablApproxBuilder> pimpl_;
@@ -263,15 +263,9 @@ class GlobalApproxUpdater : public TreeUpdater {
monitor_.Init(__func__);
}
void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); }
void LoadConfig(Json const &in) override {
auto const &config = get<Object const>(in);
FromJson(config.at("train_param"), &this->param_);
}
void SaveConfig(Json *p_out) const override {
auto &out = *p_out;
out["train_param"] = ToJson(param_);
}
void Configure(Args const &) override {}
void LoadConfig(Json const &) override {}
void SaveConfig(Json *) const override {}
void InitData(TrainParam const &param, HostDeviceVector<GradientPair> const *gpair,
linalg::Matrix<GradientPair> *sampled) {
@@ -281,20 +275,17 @@ class GlobalApproxUpdater : public TreeUpdater {
SampleGradient(ctx_, param, sampled->HostView());
}
char const *Name() const override { return "grow_histmaker"; }
[[nodiscard]] char const *Name() const override { return "grow_histmaker"; }
void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *m,
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *m,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) override {
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
pimpl_ = std::make_unique<GloablApproxBuilder>(param_, m->Info(), ctx_, column_sampler_, task_,
pimpl_ = std::make_unique<GloablApproxBuilder>(param, m->Info(), ctx_, column_sampler_, task_,
&monitor_);
linalg::Matrix<GradientPair> h_gpair;
// Obtain the hessian values for weighted sketching
InitData(param_, gpair, &h_gpair);
InitData(*param, gpair, &h_gpair);
std::vector<float> hess(h_gpair.Size());
auto const &s_gpair = h_gpair.Data()->ConstHostVector();
std::transform(s_gpair.begin(), s_gpair.end(), hess.begin(),
@@ -302,12 +293,11 @@ class GlobalApproxUpdater : public TreeUpdater {
cached_ = m;
size_t t_idx = 0;
std::size_t t_idx = 0;
for (auto p_tree : trees) {
this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]);
++t_idx;
}
param_.learning_rate = lr;
}
bool UpdatePredictionCache(const DMatrix *data, linalg::VectorView<float> out_preds) override {
@@ -318,7 +308,7 @@ class GlobalApproxUpdater : public TreeUpdater {
return true;
}
bool HasNodePosition() const override { return true; }
[[nodiscard]] bool HasNodePosition() const override { return true; }
};
DMLC_REGISTRY_FILE_TAG(grow_histmaker);
@@ -328,5 +318,4 @@ XGBOOST_REGISTER_TREE_UPDATER(GlobalHistMaker, "grow_histmaker")
"Tree constructor that uses approximate histogram construction "
"for each node.")
.set_body([](Context const *ctx, ObjInfo task) { return new GlobalApproxUpdater(ctx, task); });
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2022 by XGBoost Contributors
/**
* Copyright 2014-2023 by XGBoost Contributors
* \file updater_colmaker.cc
* \brief use columnwise update to construct a tree
* \author Tianqi Chen
@@ -17,8 +17,7 @@
#include "../common/random.h"
#include "split_evaluator.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_colmaker);
@@ -57,18 +56,15 @@ class ColMaker: public TreeUpdater {
public:
explicit ColMaker(Context const *ctx) : TreeUpdater(ctx) {}
void Configure(const Args &args) override {
param_.UpdateAllowUnknown(args);
colmaker_param_.UpdateAllowUnknown(args);
}
void LoadConfig(Json const& in) override {
auto const& config = get<Object const>(in);
FromJson(config.at("train_param"), &this->param_);
FromJson(config.at("colmaker_train_param"), &this->colmaker_param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["train_param"] = ToJson(param_);
void SaveConfig(Json *p_out) const override {
auto &out = *p_out;
out["colmaker_train_param"] = ToJson(colmaker_param_);
}
@@ -95,7 +91,7 @@ class ColMaker: public TreeUpdater {
}
}
void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree *> &trees) override {
if (collective::IsDistributed()) {
@@ -108,22 +104,16 @@ class ColMaker: public TreeUpdater {
}
this->LazyGetColumnDensity(dmat);
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
interaction_constraints_.Configure(param_, dmat->Info().num_row_);
interaction_constraints_.Configure(*param, dmat->Info().num_row_);
// build tree
for (auto tree : trees) {
CHECK(ctx_);
Builder builder(param_, colmaker_param_, interaction_constraints_, ctx_,
column_densities_);
Builder builder(*param, colmaker_param_, interaction_constraints_, ctx_, column_densities_);
builder.Update(gpair->ConstHostVector(), dmat, tree);
}
param_.learning_rate = lr;
}
protected:
// training parameter
TrainParam param_;
ColMakerTrainParam colmaker_param_;
// SplitEvaluator that will be cloned for each Builder
std::vector<float> column_densities_;
@@ -234,9 +224,9 @@ class ColMaker: public TreeUpdater {
}
}
{
column_sampler_.Init(fmat.Info().num_col_, fmat.Info().feature_weights.ConstHostVector(),
param_.colsample_bynode, param_.colsample_bylevel,
param_.colsample_bytree);
column_sampler_.Init(ctx_, fmat.Info().num_col_,
fmat.Info().feature_weights.ConstHostVector(), param_.colsample_bynode,
param_.colsample_bylevel, param_.colsample_bytree);
}
{
// setup temp space for each thread
@@ -614,5 +604,4 @@ class ColMaker: public TreeUpdater {
XGBOOST_REGISTER_TREE_UPDATER(ColMaker, "grow_colmaker")
.describe("Grow tree with parallelization over columns.")
.set_body([](Context const *ctx, ObjInfo) { return new ColMaker(ctx); });
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2017-2022 XGBoost contributors
/**
* Copyright 2017-2023 by XGBoost contributors
*/
#include <thrust/copy.h>
#include <thrust/reduce.h>
@@ -160,11 +160,11 @@ class DeviceHistogramStorage {
if (nidx_map_.find(nidx) != nidx_map_.cend()) {
// Fetch from normal cache
auto ptr = data_.data().get() + nidx_map_.at(nidx);
return common::Span<GradientSumT>(reinterpret_cast<GradientSumT*>(ptr), n_bins_);
return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
} else {
// Fetch from overflow
auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
return common::Span<GradientSumT>(reinterpret_cast<GradientSumT*>(ptr), n_bins_);
return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
}
}
};
@@ -243,7 +243,7 @@ struct GPUHistMakerDevice {
// thread safe
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
auto const& info = dmat->Info();
this->column_sampler.Init(num_columns, info.feature_weights.HostVector(),
this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
@@ -306,6 +306,8 @@ struct GPUHistMakerDevice {
matrix.is_dense
};
dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
// Store the feature set ptrs so they don't go out of scope before the kernel is called
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
for (size_t i = 0; i < candidates.size(); i++) {
auto candidate = candidates.at(i);
int left_nidx = tree[candidate.nid].LeftChild();
@@ -314,10 +316,12 @@ struct GPUHistMakerDevice {
nidx[i * 2 + 1] = right_nidx;
auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
left_sampled_features->SetDevice(ctx_->gpu_id);
feature_sets.emplace_back(left_sampled_features);
common::Span<bst_feature_t> left_feature_set =
interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
right_sampled_features->SetDevice(ctx_->gpu_id);
feature_sets.emplace_back(right_sampled_features);
common::Span<bst_feature_t> right_feature_set =
interaction_constraints.Query(right_sampled_features->DeviceSpan(),
right_nidx);
@@ -330,8 +334,8 @@ struct GPUHistMakerDevice {
}
bst_feature_t max_active_features = 0;
for (auto input : h_node_inputs) {
max_active_features = std::max(max_active_features,
bst_feature_t(input.feature_set.size()));
max_active_features =
std::max(max_active_features, static_cast<bst_feature_t>(input.feature_set.size()));
}
dh::safe_cuda(cudaMemcpyAsync(
d_node_inputs.data().get(), h_node_inputs.data(),
@@ -752,7 +756,6 @@ class GPUHistMaker : public TreeUpdater {
void Configure(const Args& args) override {
// Used in test to count how many configurations are performed
LOG(DEBUG) << "[GPU Hist]: Configure";
param_.UpdateAllowUnknown(args);
hist_maker_param_.UpdateAllowUnknown(args);
dh::CheckComputeCapability();
initialised_ = false;
@@ -764,32 +767,26 @@ class GPUHistMaker : public TreeUpdater {
auto const& config = get<Object const>(in);
FromJson(config.at("gpu_hist_train_param"), &this->hist_maker_param_);
initialised_ = false;
FromJson(config.at("train_param"), &param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["gpu_hist_train_param"] = ToJson(hist_maker_param_);
out["train_param"] = ToJson(param_);
}
~GPUHistMaker() { // NOLINT
dh::GlobalMemoryLogger().Log();
}
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& trees) override {
monitor_.Start("Update");
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
// build tree
try {
size_t t_idx{0};
for (xgboost::RegTree* tree : trees) {
this->UpdateTree(gpair, dmat, tree, &out_position[t_idx]);
this->UpdateTree(param, gpair, dmat, tree, &out_position[t_idx]);
if (hist_maker_param_.debug_synchronize) {
this->CheckTreesSynchronized(tree);
@@ -800,12 +797,10 @@ class GPUHistMaker : public TreeUpdater {
} catch (const std::exception& e) {
LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
}
param_.learning_rate = lr;
monitor_.Stop("Update");
}
void InitDataOnce(DMatrix* dmat) {
void InitDataOnce(TrainParam const* param, DMatrix* dmat) {
CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
info_ = &dmat->Info();
@@ -814,24 +809,24 @@ class GPUHistMaker : public TreeUpdater {
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
BatchParam batch_param{
ctx_->gpu_id,
param_.max_bin,
ctx_->gpu_id,
param->max_bin,
};
auto page = (*dmat->GetBatches<EllpackPage>(batch_param).begin()).Impl();
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
info_->feature_types.SetDevice(ctx_->gpu_id);
maker.reset(new GPUHistMakerDevice<GradientSumT>(
ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, param_,
ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
column_sampling_seed, info_->num_col_, batch_param));
p_last_fmat_ = dmat;
initialised_ = true;
}
void InitData(DMatrix* dmat, RegTree const* p_tree) {
void InitData(TrainParam const* param, DMatrix* dmat, RegTree const* p_tree) {
if (!initialised_) {
monitor_.Start("InitDataOnce");
this->InitDataOnce(dmat);
this->InitDataOnce(param, dmat);
monitor_.Stop("InitDataOnce");
}
p_last_tree_ = p_tree;
@@ -852,10 +847,10 @@ class GPUHistMaker : public TreeUpdater {
CHECK(*local_tree == reference_tree);
}
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
HostDeviceVector<bst_node_t>* p_out_position) {
void UpdateTree(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
RegTree* p_tree, HostDeviceVector<bst_node_t>* p_out_position) {
monitor_.Start("InitData");
this->InitData(p_fmat, p_tree);
this->InitData(param, p_fmat, p_tree);
monitor_.Stop("InitData");
gpair->SetDevice(ctx_->gpu_id);
@@ -874,7 +869,6 @@ class GPUHistMaker : public TreeUpdater {
return result;
}
TrainParam param_; // NOLINT
MetaInfo* info_{}; // NOLINT
std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker; // NOLINT

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2022 by XGBoost Contributors
/**
* Copyright 2014-2023 by XGBoost Contributors
* \file updater_prune.cc
* \brief prune a tree given the statistics
* \author Tianqi Chen
@@ -8,13 +8,11 @@
#include <memory>
#include "../common/timer.h"
#include "./param.h"
#include "xgboost/base.h"
#include "xgboost/json.h"
#include "./param.h"
#include "../common/timer.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_prune);
/*! \brief pruner that prunes a tree after growing finishes */
@@ -24,47 +22,31 @@ class TreePruner : public TreeUpdater {
syncher_.reset(TreeUpdater::Create("sync", ctx_, task));
pruner_monitor_.Init("TreePruner");
}
char const* Name() const override {
return "prune";
}
[[nodiscard]] char const* Name() const override { return "prune"; }
// set training parameter
void Configure(const Args& args) override {
param_.UpdateAllowUnknown(args);
syncher_->Configure(args);
}
void Configure(const Args& args) override { syncher_->Configure(args); }
void LoadConfig(Json const& in) override {
auto const& config = get<Object const>(in);
FromJson(config.at("train_param"), &this->param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["train_param"] = ToJson(param_);
}
bool CanModifyTree() const override {
return true;
}
void LoadConfig(Json const&) override {}
void SaveConfig(Json*) const override {}
[[nodiscard]] bool CanModifyTree() const override { return true; }
// update the tree, do pruning
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& trees) override {
pruner_monitor_.Start("PrunerUpdate");
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
for (auto tree : trees) {
this->DoPrune(tree);
this->DoPrune(param, tree);
}
param_.learning_rate = lr;
syncher_->Update(gpair, p_fmat, out_position, trees);
syncher_->Update(param, gpair, p_fmat, out_position, trees);
pruner_monitor_.Stop("PrunerUpdate");
}
private:
// try to prune off current leaf
bst_node_t TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) { // NOLINT(*)
bst_node_t TryPruneLeaf(TrainParam const* param, RegTree* p_tree, int nid, int depth,
int npruned) {
auto& tree = *p_tree;
CHECK(tree[nid].IsLeaf());
if (tree[nid].IsRoot()) {
return npruned;
@@ -77,22 +59,22 @@ class TreePruner : public TreeUpdater {
auto right = tree[pid].RightChild();
bool balanced = tree[left].IsLeaf() &&
right != RegTree::kInvalidNodeId && tree[right].IsLeaf();
if (balanced && param_.NeedPrune(s.loss_chg, depth)) {
if (balanced && param->NeedPrune(s.loss_chg, depth)) {
// need to be pruned
tree.ChangeToLeaf(pid, param_.learning_rate * s.base_weight);
tree.ChangeToLeaf(pid, param->learning_rate * s.base_weight);
// tail recursion
return this->TryPruneLeaf(tree, pid, depth - 1, npruned + 2);
return this->TryPruneLeaf(param, p_tree, pid, depth - 1, npruned + 2);
} else {
return npruned;
}
}
/*! \brief do pruning of a tree */
void DoPrune(RegTree* p_tree) {
void DoPrune(TrainParam const* param, RegTree* p_tree) {
auto& tree = *p_tree;
bst_node_t npruned = 0;
for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
if (tree[nid].IsLeaf() && !tree[nid].IsDeleted()) {
npruned = this->TryPruneLeaf(tree, nid, tree.GetDepth(nid), npruned);
npruned = this->TryPruneLeaf(param, p_tree, nid, tree.GetDepth(nid), npruned);
}
}
LOG(INFO) << "tree pruning end, "
@@ -103,13 +85,10 @@ class TreePruner : public TreeUpdater {
private:
// synchronizer
std::unique_ptr<TreeUpdater> syncher_;
// training parameter
TrainParam param_;
common::Monitor pruner_monitor_;
};
XGBOOST_REGISTER_TREE_UPDATER(TreePruner, "prune")
.describe("Pruner that prune the tree according to statistics.")
.set_body([](Context const* ctx, ObjInfo task) { return new TreePruner(ctx, task); });
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -28,21 +28,14 @@ namespace tree {
DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);
void QuantileHistMaker::Configure(const Args &args) {
param_.UpdateAllowUnknown(args);
}
void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
void QuantileHistMaker::Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair,
DMatrix *dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) {
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
// build tree
const size_t n_trees = trees.size();
if (!pimpl_) {
pimpl_.reset(new Builder(n_trees, param_, dmat, task_, ctx_));
pimpl_.reset(new Builder(n_trees, param, dmat, task_, ctx_));
}
size_t t_idx{0};
@@ -51,8 +44,6 @@ void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair, DMatrix *d
this->pimpl_->UpdateTree(gpair, dmat, p_tree, &t_row_position);
++t_idx;
}
param_.learning_rate = lr;
}
bool QuantileHistMaker::UpdatePredictionCache(const DMatrix *data,
@@ -107,7 +98,7 @@ CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
auto weight = evaluator_->InitRoot(GradStats{grad_stat});
p_tree->Stat(RegTree::kRoot).sum_hess = grad_stat.GetHess();
p_tree->Stat(RegTree::kRoot).base_weight = weight;
(*p_tree)[RegTree::kRoot].SetLeaf(param_.learning_rate * weight);
(*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
std::vector<CPUExpandEntry> entries{node};
monitor_->Start("EvaluateSplits");
@@ -173,7 +164,7 @@ void QuantileHistMaker::Builder::ExpandTree(DMatrix *p_fmat, RegTree *p_tree,
HostDeviceVector<bst_node_t> *p_out_position) {
monitor_->Start(__func__);
Driver<CPUExpandEntry> driver(param_);
Driver<CPUExpandEntry> driver(*param_);
driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h));
auto const &tree = *p_tree;
auto expand_set = driver.Pop();
@@ -277,21 +268,19 @@ void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,
} else {
CHECK_EQ(n_total_bins, page.cut.TotalBins());
}
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid);
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid, fmat->IsColumnSplit());
++page_id;
}
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed());
collective::IsDistributed(), fmat->IsColumnSplit());
auto m_gpair =
linalg::MakeTensorView(*gpair, {gpair->size(), static_cast<std::size_t>(1)}, ctx_->gpu_id);
SampleGradient(ctx_, param_, m_gpair);
auto m_gpair = linalg::MakeTensorView(ctx_, *gpair, gpair->size(), static_cast<std::size_t>(1));
SampleGradient(ctx_, *param_, m_gpair);
}
// store a pointer to the tree
p_last_tree_ = &tree;
evaluator_.reset(
new HistEvaluator<CPUExpandEntry>{param_, info, this->ctx_->Threads(), column_sampler_});
evaluator_.reset(new HistEvaluator<CPUExpandEntry>{ctx_, param_, info, column_sampler_});
monitor_->Stop(__func__);
}

View File

@@ -35,49 +35,36 @@
#include "../common/partition_builder.h"
#include "../common/column_matrix.h"
namespace xgboost {
namespace tree {
inline BatchParam HistBatch(TrainParam const& param) {
return {param.max_bin, param.sparse_threshold};
namespace xgboost::tree {
inline BatchParam HistBatch(TrainParam const* param) {
return {param->max_bin, param->sparse_threshold};
}
/*! \brief construct a tree using quantized feature values */
class QuantileHistMaker: public TreeUpdater {
public:
explicit QuantileHistMaker(Context const* ctx, ObjInfo task) : TreeUpdater(ctx), task_{task} {}
void Configure(const Args& args) override;
void Configure(const Args&) override {}
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& trees) override;
bool UpdatePredictionCache(const DMatrix *data,
linalg::VectorView<float> out_preds) override;
void LoadConfig(Json const& in) override {
auto const& config = get<Object const>(in);
FromJson(config.at("train_param"), &this->param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["train_param"] = ToJson(param_);
}
void LoadConfig(Json const&) override {}
void SaveConfig(Json*) const override {}
char const* Name() const override {
return "grow_quantile_histmaker";
}
bool HasNodePosition() const override { return true; }
[[nodiscard]] char const* Name() const override { return "grow_quantile_histmaker"; }
[[nodiscard]] bool HasNodePosition() const override { return true; }
protected:
// training parameter
TrainParam param_;
// actual builder that runs the algorithm
struct Builder {
public:
// constructor
explicit Builder(const size_t n_trees, const TrainParam& param, DMatrix const* fmat,
explicit Builder(const size_t n_trees, TrainParam const* param, DMatrix const* fmat,
ObjInfo task, Context const* ctx)
: n_trees_(n_trees),
param_(param),
@@ -115,7 +102,7 @@ class QuantileHistMaker: public TreeUpdater {
private:
const size_t n_trees_;
const TrainParam& param_;
TrainParam const* param_;
std::shared_ptr<common::ColumnSampler> column_sampler_{
std::make_shared<common::ColumnSampler>()};
@@ -140,7 +127,6 @@ class QuantileHistMaker: public TreeUpdater {
std::unique_ptr<Builder> pimpl_;
ObjInfo task_;
};
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree
#endif // XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2022 by XGBoost Contributors
/**
* Copyright 2014-2023 by XGBoost Contributors
* \file updater_refresh.cc
* \brief refresh the statistics and leaf value on the tree on the dataset
* \author Tianqi Chen
@@ -16,8 +16,7 @@
#include "./param.h"
#include "xgboost/json.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_refresh);
@@ -25,23 +24,14 @@ DMLC_REGISTRY_FILE_TAG(updater_refresh);
class TreeRefresher : public TreeUpdater {
public:
explicit TreeRefresher(Context const *ctx) : TreeUpdater(ctx) {}
void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); }
void LoadConfig(Json const& in) override {
auto const& config = get<Object const>(in);
FromJson(config.at("train_param"), &this->param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["train_param"] = ToJson(param_);
}
char const* Name() const override {
return "refresh";
}
bool CanModifyTree() const override {
return true;
}
void Configure(const Args &) override {}
void LoadConfig(Json const &) override {}
void SaveConfig(Json *) const override {}
[[nodiscard]] char const *Name() const override { return "refresh"; }
[[nodiscard]] bool CanModifyTree() const override { return true; }
// update the tree, do pruning
void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree *> &trees) override {
if (trees.size() == 0) return;
@@ -103,16 +93,11 @@ class TreeRefresher : public TreeUpdater {
lazy_get_stats();
collective::Allreduce<collective::Operation::kSum>(&dmlc::BeginPtr(stemp[0])->sum_grad,
stemp[0].size() * 2);
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
int offset = 0;
for (auto tree : trees) {
this->Refresh(dmlc::BeginPtr(stemp[0]) + offset, 0, tree);
this->Refresh(param, dmlc::BeginPtr(stemp[0]) + offset, 0, tree);
offset += tree->param.num_nodes;
}
// set learning rate back
param_.learning_rate = lr;
}
private:
@@ -135,31 +120,27 @@ class TreeRefresher : public TreeUpdater {
gstats[pid].Add(gpair[ridx]);
}
}
inline void Refresh(const GradStats *gstats,
int nid, RegTree *p_tree) {
inline void Refresh(TrainParam const *param, const GradStats *gstats, int nid, RegTree *p_tree) {
RegTree &tree = *p_tree;
tree.Stat(nid).base_weight =
static_cast<bst_float>(CalcWeight(param_, gstats[nid]));
static_cast<bst_float>(CalcWeight(*param, gstats[nid]));
tree.Stat(nid).sum_hess = static_cast<bst_float>(gstats[nid].sum_hess);
if (tree[nid].IsLeaf()) {
if (param_.refresh_leaf) {
tree[nid].SetLeaf(tree.Stat(nid).base_weight * param_.learning_rate);
if (param->refresh_leaf) {
tree[nid].SetLeaf(tree.Stat(nid).base_weight * param->learning_rate);
}
} else {
tree.Stat(nid).loss_chg = static_cast<bst_float>(
xgboost::tree::CalcGain(param_, gstats[tree[nid].LeftChild()]) +
xgboost::tree::CalcGain(param_, gstats[tree[nid].RightChild()]) -
xgboost::tree::CalcGain(param_, gstats[nid]));
this->Refresh(gstats, tree[nid].LeftChild(), p_tree);
this->Refresh(gstats, tree[nid].RightChild(), p_tree);
tree.Stat(nid).loss_chg =
static_cast<bst_float>(xgboost::tree::CalcGain(*param, gstats[tree[nid].LeftChild()]) +
xgboost::tree::CalcGain(*param, gstats[tree[nid].RightChild()]) -
xgboost::tree::CalcGain(*param, gstats[nid]));
this->Refresh(param, gstats, tree[nid].LeftChild(), p_tree);
this->Refresh(param, gstats, tree[nid].RightChild(), p_tree);
}
}
// training parameter
TrainParam param_;
};
XGBOOST_REGISTER_TREE_UPDATER(TreeRefresher, "refresh")
.describe("Refresher that refreshes the weight and statistics according to data.")
.set_body([](Context const *ctx, ObjInfo) { return new TreeRefresher(ctx); });
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2019 by Contributors
/**
 * Copyright 2014-2023 by XGBoost Contributors
* \file updater_sync.cc
* \brief synchronize the tree in all distributed nodes
*/
@@ -13,8 +13,7 @@
#include "../common/io.h"
#include "xgboost/json.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_sync);
@@ -30,11 +29,9 @@ class TreeSyncher : public TreeUpdater {
void LoadConfig(Json const&) override {}
void SaveConfig(Json*) const override {}
char const* Name() const override {
return "prune";
}
[[nodiscard]] char const* Name() const override { return "prune"; }
void Update(HostDeviceVector<GradientPair>*, DMatrix*,
void Update(TrainParam const*, HostDeviceVector<GradientPair>*, DMatrix*,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree*>& trees) override {
if (collective::GetWorldSize() == 1) return;
@@ -57,5 +54,4 @@ class TreeSyncher : public TreeUpdater {
XGBOOST_REGISTER_TREE_UPDATER(TreeSyncher, "sync")
.describe("Syncher that synchronize the tree in all distributed nodes.")
.set_body([](Context const* ctx, ObjInfo) { return new TreeSyncher(ctx); });
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree