Merge branch 'master' into dev-hui
This commit is contained in:
@@ -1,10 +1,32 @@
|
||||
/*!
|
||||
* Copyright 2022 by XGBoost Contributors
|
||||
/**
|
||||
* Copyright 2022-2023 by XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_ALGORITHM_H_
|
||||
#define XGBOOST_COMMON_ALGORITHM_H_
|
||||
#include <algorithm> // std::upper_bound
|
||||
#include <cinttypes> // std::size_t
|
||||
#include <algorithm> // upper_bound, stable_sort, sort, max
|
||||
#include <cinttypes> // size_t
|
||||
#include <functional> // less
|
||||
#include <iterator> // iterator_traits, distance
|
||||
#include <vector> // vector
|
||||
|
||||
#include "numeric.h" // Iota
|
||||
#include "xgboost/context.h" // Context
|
||||
|
||||
// clang with libstdc++ works as well
|
||||
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
|
||||
!defined(__APPLE__) && __has_include(<omp.h>)
|
||||
#define GCC_HAS_PARALLEL 1
|
||||
#endif // GCC_HAS_PARALLEL
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
|
||||
#define MSVC_HAS_PARALLEL 1
|
||||
#endif // MSC
|
||||
|
||||
#if defined(GCC_HAS_PARALLEL)
|
||||
#include <parallel/algorithm>
|
||||
#elif defined(MSVC_HAS_PARALLEL)
|
||||
#include <ppl.h>
|
||||
#endif // GCC_HAS_PARALLEL / MSVC_HAS_PARALLEL
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -13,6 +35,63 @@ auto SegmentId(It first, It last, Idx idx) {
|
||||
std::size_t segment_id = std::upper_bound(first, last, idx) - 1 - first;
|
||||
return segment_id;
|
||||
}
|
||||
|
||||
template <typename Iter, typename Comp>
|
||||
void StableSort(Context const *ctx, Iter begin, Iter end, Comp &&comp) {
|
||||
if (ctx->Threads() > 1) {
|
||||
#if defined(GCC_HAS_PARALLEL)
|
||||
__gnu_parallel::stable_sort(begin, end, comp,
|
||||
__gnu_parallel::default_parallel_tag(ctx->Threads()));
|
||||
#else
|
||||
// the only stable sort is radix sort for msvc ppl.
|
||||
std::stable_sort(begin, end, comp);
|
||||
#endif // GLIBC VERSION
|
||||
} else {
|
||||
std::stable_sort(begin, end, comp);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Iter, typename Comp>
|
||||
void Sort(Context const *ctx, Iter begin, Iter end, Comp comp) {
|
||||
if (ctx->Threads() > 1) {
|
||||
#if defined(GCC_HAS_PARALLEL)
|
||||
__gnu_parallel::sort(begin, end, comp, __gnu_parallel::default_parallel_tag(ctx->Threads()));
|
||||
#elif defined(MSVC_HAS_PARALLEL)
|
||||
auto n = std::distance(begin, end);
|
||||
// use chunk size as hint to number of threads. No local policy/scheduler input with the
|
||||
// concurrency module.
|
||||
std::size_t chunk_size = n / ctx->Threads();
|
||||
// 2048 is the default of msvc ppl as of v2022.
|
||||
chunk_size = std::max(chunk_size, static_cast<std::size_t>(2048));
|
||||
concurrency::parallel_sort(begin, end, comp, chunk_size);
|
||||
#else
|
||||
std::sort(begin, end, comp);
|
||||
#endif // GLIBC VERSION
|
||||
} else {
|
||||
std::sort(begin, end, comp);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Idx, typename Iter, typename V = typename std::iterator_traits<Iter>::value_type,
|
||||
typename Comp = std::less<V>>
|
||||
std::vector<Idx> ArgSort(Context const *ctx, Iter begin, Iter end, Comp comp = std::less<V>{}) {
|
||||
CHECK(ctx->IsCPU());
|
||||
auto n = std::distance(begin, end);
|
||||
std::vector<Idx> result(n);
|
||||
Iota(ctx, result.begin(), result.end(), 0);
|
||||
auto op = [&](Idx const &l, Idx const &r) { return comp(begin[l], begin[r]); };
|
||||
StableSort(ctx, result.begin(), result.end(), op);
|
||||
return result;
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
#if defined(GCC_HAS_PARALLEL)
|
||||
#undef GCC_HAS_PARALLEL
|
||||
#endif // defined(GCC_HAS_PARALLEL)
|
||||
|
||||
#if defined(MSVC_HAS_PARALLEL)
|
||||
#undef MSVC_HAS_PARALLEL
|
||||
#endif // defined(MSVC_HAS_PARALLEL)
|
||||
|
||||
#endif // XGBOOST_COMMON_ALGORITHM_H_
|
||||
|
||||
@@ -42,9 +42,9 @@ constexpr inline bst_cat_t OutOfRangeCat() {
|
||||
|
||||
/**
 * @brief Whether a category value stored as float lies outside the accepted range.
 *
 * @param cat Category value.
 *
 * @return true when cat is negative or not less than OutOfRangeCat().
 */
inline XGBOOST_DEVICE bool InvalidCat(float cat) {
  constexpr auto kMaxCat = OutOfRangeCat();
  // kMaxCat survives a round trip through float unchanged ...
  static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat)) == kMaxCat);
  // ... while kMaxCat + 1 does not ...
  static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat + 1)) != kMaxCat + 1);
  // ... because as a float it compares equal to kMaxCat.
  static_assert(static_cast<float>(kMaxCat + 1) == kMaxCat);
  return cat < 0 || cat >= kMaxCat;
}
|
||||
|
||||
|
||||
@@ -270,7 +270,9 @@ struct RyuPowLogUtils {
|
||||
*/
|
||||
static uint32_t MulPow5InvDivPow2(const uint32_t m, const uint32_t q,
|
||||
const int32_t j) noexcept(true) {
|
||||
return MulShift(m, kFloatPow5InvSplit[q], j);
|
||||
static_assert(sizeof(kFloatPow5InvSplit) == 55 * sizeof(std::uint64_t));
|
||||
assert(q < 55);
|
||||
return MulShift(m, kFloatPow5InvSplit[q], j); // NOLINT
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -495,12 +497,10 @@ class PowerBaseComputer {
|
||||
static_cast<int32_t>(IEEE754::kFloatBias) -
|
||||
static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
|
||||
static_cast<int32_t>(2);
|
||||
static_assert(static_cast<int32_t>(1) -
|
||||
static_cast<int32_t>(IEEE754::kFloatBias) -
|
||||
static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
|
||||
static_cast<int32_t>(2) ==
|
||||
-151,
|
||||
"");
|
||||
static_assert(static_cast<int32_t>(1) - static_cast<int32_t>(IEEE754::kFloatBias) -
|
||||
static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
|
||||
static_cast<int32_t>(2) ==
|
||||
-151);
|
||||
mantissa_base2 = f.mantissa;
|
||||
} else {
|
||||
base2_range.exponent = static_cast<int32_t>(f.exponent) - IEEE754::kFloatBias -
|
||||
@@ -544,7 +544,7 @@ class RyuPrinter {
|
||||
// Function precondition: v is not a 10-digit number.
|
||||
// (f2s: 9 digits are sufficient for round-tripping.)
|
||||
// (d2fixed: We print 9-digit blocks.)
|
||||
static_assert(100000000 == Tens(8), "");
|
||||
static_assert(100000000 == Tens(8));
|
||||
assert(v < Tens(9));
|
||||
if (v >= Tens(8)) {
|
||||
return 9;
|
||||
@@ -911,7 +911,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
|
||||
// the bias and also special-case the value 0.
|
||||
int32_t shift = (f_e2 == 0 ? 1 : f_e2) - exp_b2 - IEEE754::kFloatBias -
|
||||
IEEE754::kFloatMantissaBits;
|
||||
assert(shift >= 0);
|
||||
assert(shift >= 1);
|
||||
|
||||
// We need to round up if the exact value is more than 0.5 above the value we
|
||||
// computed. That's equivalent to checking if the last removed bit was 1 and
|
||||
@@ -920,7 +920,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
|
||||
//
|
||||
// We need to update trailingZeros given that we have the exact output
|
||||
// exponent ieee_e2 now.
|
||||
trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0;
|
||||
trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0; // NOLINT
|
||||
uint32_t lastRemovedBit = (mantissa_b2 >> (shift - 1)) & 1;
|
||||
bool roundup = (lastRemovedBit != 0) &&
|
||||
(!trailing_zeros || (((mantissa_b2 >> shift) & 1) != 0));
|
||||
|
||||
@@ -87,7 +87,7 @@ inline to_chars_result to_chars(char *first, char *last, int64_t value) { // NOL
|
||||
if (value < 0) {
|
||||
*first = '-';
|
||||
std::advance(first, 1);
|
||||
unsigned_value = uint64_t(~value) + uint64_t(1);
|
||||
unsigned_value = static_cast<uint64_t>(~value) + static_cast<uint64_t>(1);
|
||||
}
|
||||
return detail::ToCharsUnsignedImpl(first, last, unsigned_value);
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
|
||||
feature_offsets_[fid] = accum_index;
|
||||
}
|
||||
|
||||
SetTypeSize(gmat.max_num_bins);
|
||||
SetTypeSize(gmat.MaxNumBinPerFeat());
|
||||
auto storage_size =
|
||||
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
|
||||
index_.resize(storage_size, 0);
|
||||
|
||||
@@ -188,17 +188,6 @@ inline void SetDevice(std::int32_t device) {
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
 * @brief Compute the permutation of indices that stably sorts a container under comp.
 *
 * @tparam Idx  Integer type used for the returned indices.
 * @param array Container read through size() and operator[]; it is not modified.
 * @param comp  Comparison applied to the container's values (defaults to std::less).
 *
 * @return Vector of array.size() indices in sorted order.
 */
template <typename Idx, typename Container,
          typename V = typename Container::value_type,
          typename Comp = std::less<V>>
std::vector<Idx> ArgSort(Container const &array, Comp comp = std::less<V>{}) {
  std::vector<Idx> order(array.size());
  std::iota(order.begin(), order.end(), 0);

  // Compare two positions by the values they refer to.
  auto by_value = [&array, comp](Idx const &lhs, Idx const &rhs) {
    return comp(array[lhs], array[rhs]);
  };
  XGBOOST_PARALLEL_STABLE_SORT(order.begin(), order.end(), by_value);
  return order;
}
|
||||
|
||||
/**
|
||||
* Last index of a group in a CSR style of index pointer.
|
||||
*/
|
||||
@@ -206,31 +195,6 @@ template <typename Indexable>
|
||||
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
|
||||
return indptr[group + 1] - 1;
|
||||
}
|
||||
|
||||
/**
 * \brief A CRTP (curiously recurring template pattern) helper.
 *
 * https://www.fluentcpp.com/2017/05/19/crtp-helper/
 *
 * Does two things:
 * 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class.
 * 2. Avoids having to `static_cast` in a lot of places.
 *
 * \tparam T The derived class in a CRTP hierarchy.
 */
template <typename T>
struct Crtp {
  /** \brief Access this object as the derived type. */
  constexpr T &Underlying() noexcept { return static_cast<T &>(*this); }
  /** \brief Access this object as the derived type (const overload). */
  constexpr T const &Underlying() const noexcept { return static_cast<T const &>(*this); }
};
|
||||
|
||||
/**
 * \brief C++17 std::as_const
 *
 * \param v Object to view as const.
 *
 * \return A const-qualified reference to the same object.
 */
template <typename T>
constexpr typename std::add_const<T>::type &AsConst(T &v) noexcept {  // NOLINT(runtime/references)
  return v;
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_COMMON_H_
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
/*!
|
||||
* Copyright 2017 by Contributors
|
||||
/**
|
||||
* Copyright 2017-2023 by XGBoost Contributors
|
||||
* \file compressed_iterator.h
|
||||
*/
|
||||
#pragma once
|
||||
#include <xgboost/base.h>
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstddef> // for size_t
|
||||
|
||||
#include "common.h"
|
||||
|
||||
@@ -36,7 +37,7 @@ static const int kPadding = 4; // Assign padding so we can read slightly off
|
||||
// The number of bits required to represent a given unsigned range
|
||||
inline XGBOOST_DEVICE size_t SymbolBits(size_t num_symbols) {
|
||||
auto bits = std::ceil(log2(static_cast<double>(num_symbols)));
|
||||
return common::Max(static_cast<size_t>(bits), size_t(1));
|
||||
return common::Max(static_cast<size_t>(bits), static_cast<std::size_t>(1));
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <cstddef> // for size_t
|
||||
#include <cub/cub.cuh>
|
||||
#include <cub/util_allocator.cuh>
|
||||
#include <numeric>
|
||||
@@ -178,7 +179,7 @@ inline size_t MaxSharedMemory(int device_idx) {
|
||||
dh::safe_cuda(cudaDeviceGetAttribute
|
||||
(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
|
||||
device_idx));
|
||||
return size_t(max_shared_memory);
|
||||
return static_cast<std::size_t>(max_shared_memory);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -195,7 +196,7 @@ inline size_t MaxSharedMemoryOptin(int device_idx) {
|
||||
dh::safe_cuda(cudaDeviceGetAttribute
|
||||
(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlockOptin,
|
||||
device_idx));
|
||||
return size_t(max_shared_memory);
|
||||
return static_cast<std::size_t>(max_shared_memory);
|
||||
}
|
||||
|
||||
inline void CheckComputeCapability() {
|
||||
|
||||
@@ -46,7 +46,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
|
||||
if (!use_sorted) {
|
||||
HostSketchContainer container(max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
|
||||
HostSketchContainer::UseGroup(info),
|
||||
m->Info().data_split_mode == DataSplitMode::kCol, n_threads);
|
||||
m->IsColumnSplit(), n_threads);
|
||||
for (auto const& page : m->GetBatches<SparsePage>()) {
|
||||
container.PushRowPage(page, info, hessian);
|
||||
}
|
||||
@@ -54,7 +54,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
|
||||
} else {
|
||||
SortedSketchContainer container{max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
|
||||
HostSketchContainer::UseGroup(info),
|
||||
m->Info().data_split_mode == DataSplitMode::kCol, n_threads};
|
||||
m->IsColumnSplit(), n_threads};
|
||||
for (auto const& page : m->GetBatches<SortedCSCPage>()) {
|
||||
container.PushColPage(page, info, hessian);
|
||||
}
|
||||
|
||||
@@ -1,33 +1,31 @@
|
||||
/*!
|
||||
* Copyright 2018~2020 XGBoost contributors
|
||||
/**
|
||||
* Copyright 2018~2023 by XGBoost contributors
|
||||
*/
|
||||
|
||||
#include <xgboost/logging.h>
|
||||
|
||||
#include <thrust/binary_search.h>
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
#include <thrust/functional.h>
|
||||
#include <thrust/iterator/counting_iterator.h>
|
||||
#include <thrust/iterator/transform_iterator.h>
|
||||
#include <thrust/iterator/discard_iterator.h>
|
||||
#include <thrust/iterator/transform_iterator.h>
|
||||
#include <thrust/reduce.h>
|
||||
#include <thrust/sort.h>
|
||||
#include <thrust/binary_search.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
#include <xgboost/logging.h>
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "categorical.h"
|
||||
#include "device_helpers.cuh"
|
||||
#include "hist_util.h"
|
||||
#include "hist_util.cuh"
|
||||
#include "hist_util.h"
|
||||
#include "math.h" // NOLINT
|
||||
#include "quantile.h"
|
||||
#include "categorical.h"
|
||||
#include "xgboost/host_device_vector.h"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
@@ -318,7 +316,7 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
|
||||
size_t batch_nnz = batch.data.Size();
|
||||
auto const& info = dmat->Info();
|
||||
for (auto begin = 0ull; begin < batch_nnz; begin += sketch_batch_num_elements) {
|
||||
size_t end = std::min(batch_nnz, size_t(begin + sketch_batch_num_elements));
|
||||
size_t end = std::min(batch_nnz, static_cast<std::size_t>(begin + sketch_batch_num_elements));
|
||||
if (has_weights) {
|
||||
bool is_ranking = HostSketchContainer::UseGroup(dmat->Info());
|
||||
dh::caching_device_vector<uint32_t> groups(info.group_ptr_.cbegin(),
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2020 XGBoost contributors
|
||||
/**
|
||||
* Copyright 2020-2023 by XGBoost contributors
|
||||
*
|
||||
* \brief Front end and utilities for GPU based sketching. Works on sliding window
|
||||
* instead of stream.
|
||||
@@ -9,11 +9,13 @@
|
||||
|
||||
#include <thrust/host_vector.h>
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
|
||||
#include "../data/device_adapter.cuh"
|
||||
#include "device_helpers.cuh"
|
||||
#include "hist_util.h"
|
||||
#include "quantile.cuh"
|
||||
#include "device_helpers.cuh"
|
||||
#include "timer.h"
|
||||
#include "../data/device_adapter.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -304,7 +306,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
|
||||
num_rows, num_cols, std::numeric_limits<size_t>::max(),
|
||||
device, num_cuts_per_feature, true);
|
||||
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
|
||||
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
|
||||
size_t end =
|
||||
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
|
||||
ProcessWeightedSlidingWindow(batch, info,
|
||||
num_cuts_per_feature,
|
||||
HostSketchContainer::UseGroup(info), missing, device, num_cols, begin, end,
|
||||
@@ -316,7 +319,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
|
||||
num_rows, num_cols, std::numeric_limits<size_t>::max(),
|
||||
device, num_cuts_per_feature, false);
|
||||
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
|
||||
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
|
||||
size_t end =
|
||||
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
|
||||
ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
|
||||
sketch_container, num_cuts_per_feature);
|
||||
}
|
||||
|
||||
@@ -50,7 +50,7 @@ size_t PeekableInStream::PeekRead(void* dptr, size_t size) {
|
||||
}
|
||||
}
|
||||
|
||||
FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream), pointer_{0} {
|
||||
FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream) {
|
||||
size_t constexpr kInitialSize = 4096;
|
||||
size_t size{kInitialSize}, total{0};
|
||||
buffer_.clear();
|
||||
|
||||
@@ -27,8 +27,7 @@ using MemoryBufferStream = rabit::utils::MemoryBufferStream;
|
||||
*/
|
||||
class PeekableInStream : public dmlc::Stream {
|
||||
public:
|
||||
explicit PeekableInStream(dmlc::Stream* strm)
|
||||
: strm_(strm), buffer_ptr_(0) {}
|
||||
explicit PeekableInStream(dmlc::Stream* strm) : strm_(strm) {}
|
||||
|
||||
size_t Read(void* dptr, size_t size) override;
|
||||
virtual size_t PeekRead(void* dptr, size_t size);
|
||||
@@ -41,7 +40,7 @@ class PeekableInStream : public dmlc::Stream {
|
||||
/*! \brief input stream */
|
||||
dmlc::Stream *strm_;
|
||||
/*! \brief current buffer pointer */
|
||||
size_t buffer_ptr_;
|
||||
size_t buffer_ptr_{0};
|
||||
/*! \brief internal buffer */
|
||||
std::string buffer_;
|
||||
};
|
||||
@@ -72,7 +71,7 @@ class FixedSizeStream : public PeekableInStream {
|
||||
void Take(std::string* out);
|
||||
|
||||
private:
|
||||
size_t pointer_;
|
||||
size_t pointer_{0};
|
||||
std::string buffer_;
|
||||
};
|
||||
|
||||
|
||||
@@ -710,10 +710,10 @@ void Json::Dump(Json json, JsonWriter* writer) {
|
||||
writer->Save(json);
|
||||
}
|
||||
|
||||
static_assert(std::is_nothrow_move_constructible<Json>::value, "");
|
||||
static_assert(std::is_nothrow_move_constructible<Object>::value, "");
|
||||
static_assert(std::is_nothrow_move_constructible<Array>::value, "");
|
||||
static_assert(std::is_nothrow_move_constructible<String>::value, "");
|
||||
static_assert(std::is_nothrow_move_constructible<Json>::value);
|
||||
static_assert(std::is_nothrow_move_constructible<Object>::value);
|
||||
static_assert(std::is_nothrow_move_constructible<Array>::value);
|
||||
static_assert(std::is_nothrow_move_constructible<String>::value);
|
||||
|
||||
Json UBJReader::ParseArray() {
|
||||
auto marker = PeekNextChar();
|
||||
|
||||
@@ -14,7 +14,7 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
|
||||
if (ctx->IsCPU()) {
|
||||
auto const& h_values = values.ConstHostVector();
|
||||
auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
|
||||
static_assert(std::is_same<decltype(result), double>::value, "");
|
||||
static_assert(std::is_same<decltype(result), double>::value);
|
||||
return result;
|
||||
}
|
||||
return cuda_impl::Reduce(ctx, values);
|
||||
|
||||
@@ -42,8 +42,8 @@ void RunLengthEncode(Iter begin, Iter end, std::vector<Idx>* p_out) {
|
||||
*/
|
||||
template <typename InIt, typename OutIt, typename T>
|
||||
void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
|
||||
static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value, "");
|
||||
static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value, "");
|
||||
static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value);
|
||||
static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value);
|
||||
// The number of threads is pegged to the batch size. If the OMP block is parallelized
|
||||
// on anything other than the batch/block size, it should be reassigned
|
||||
auto n = static_cast<size_t>(std::distance(begin, end));
|
||||
|
||||
@@ -31,6 +31,8 @@ namespace common {
|
||||
// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
|
||||
template<size_t BlockSize>
|
||||
class PartitionBuilder {
|
||||
using BitVector = RBitField8;
|
||||
|
||||
public:
|
||||
template<typename Func>
|
||||
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
|
||||
@@ -121,27 +123,11 @@ class PartitionBuilder {
|
||||
bool default_left = tree[nid].DefaultLeft();
|
||||
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
|
||||
auto node_cats = tree.NodeCats(nid);
|
||||
|
||||
auto const& index = gmat.index;
|
||||
auto const& cut_values = gmat.cut.Values();
|
||||
auto const& cut_ptrs = gmat.cut.Ptrs();
|
||||
|
||||
auto gidx_calc = [&](auto ridx) {
|
||||
auto begin = gmat.RowIdx(ridx);
|
||||
if (gmat.IsDense()) {
|
||||
return static_cast<bst_bin_t>(index[begin + fid]);
|
||||
}
|
||||
auto end = gmat.RowIdx(ridx + 1);
|
||||
auto f_begin = cut_ptrs[fid];
|
||||
auto f_end = cut_ptrs[fid + 1];
|
||||
// bypassing the column matrix as we need the cut value instead of bin idx for categorical
|
||||
// features.
|
||||
return BinarySearchBin(begin, end, index, f_begin, f_end);
|
||||
};
|
||||
|
||||
auto pred_hist = [&](auto ridx, auto bin_id) {
|
||||
if (any_cat && is_cat) {
|
||||
auto gidx = gidx_calc(ridx);
|
||||
auto gidx = gmat.GetGindex(ridx, fid);
|
||||
bool go_left = default_left;
|
||||
if (gidx > -1) {
|
||||
go_left = Decision(node_cats, cut_values[gidx]);
|
||||
@@ -153,7 +139,7 @@ class PartitionBuilder {
|
||||
};
|
||||
|
||||
auto pred_approx = [&](auto ridx) {
|
||||
auto gidx = gidx_calc(ridx);
|
||||
auto gidx = gmat.GetGindex(ridx, fid);
|
||||
bool go_left = default_left;
|
||||
if (gidx > -1) {
|
||||
if (is_cat) {
|
||||
@@ -199,6 +185,84 @@ class PartitionBuilder {
|
||||
SetNRightElems(node_in_set, range.begin(), n_right);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief When data is split by column, we don't have all the features locally on the current
|
||||
* worker, so we go through all the rows and mark the bit vectors on whether the decision is made
|
||||
* to go right, or if the feature value used for the split is missing.
|
||||
*/
|
||||
void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
|
||||
const common::Range1d range, GHistIndexMatrix const& gmat,
|
||||
const common::ColumnMatrix& column_matrix,
|
||||
const RegTree& tree, const size_t* rid,
|
||||
BitVector* decision_bits, BitVector* missing_bits) {
|
||||
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
|
||||
std::size_t nid = nodes[node_in_set].nid;
|
||||
bst_feature_t fid = tree[nid].SplitIndex();
|
||||
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
|
||||
auto node_cats = tree.NodeCats(nid);
|
||||
auto const& cut_values = gmat.cut.Values();
|
||||
|
||||
if (!column_matrix.IsInitialized()) {
|
||||
for (auto row_id : rid_span) {
|
||||
auto gidx = gmat.GetGindex(row_id, fid);
|
||||
if (gidx > -1) {
|
||||
bool go_left = false;
|
||||
if (is_cat) {
|
||||
go_left = Decision(node_cats, cut_values[gidx]);
|
||||
} else {
|
||||
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
|
||||
}
|
||||
if (go_left) {
|
||||
decision_bits->Set(row_id - gmat.base_rowid);
|
||||
}
|
||||
} else {
|
||||
missing_bits->Set(row_id - gmat.base_rowid);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Once we've aggregated the decision and missing bits from all the workers, we can then
|
||||
* use them to partition the rows accordingly.
|
||||
*/
|
||||
void PartitionByMask(const size_t node_in_set,
|
||||
std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
|
||||
const common::Range1d range, GHistIndexMatrix const& gmat,
|
||||
const common::ColumnMatrix& column_matrix, const RegTree& tree,
|
||||
const size_t* rid, BitVector const& decision_bits,
|
||||
BitVector const& missing_bits) {
|
||||
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
|
||||
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
|
||||
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
|
||||
std::size_t nid = nodes[node_in_set].nid;
|
||||
bool default_left = tree[nid].DefaultLeft();
|
||||
|
||||
auto pred_approx = [&](auto ridx) {
|
||||
bool go_left = default_left;
|
||||
bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
|
||||
if (!is_missing) {
|
||||
go_left = decision_bits.Check(ridx - gmat.base_rowid);
|
||||
}
|
||||
return go_left;
|
||||
};
|
||||
|
||||
std::pair<size_t, size_t> child_nodes_sizes;
|
||||
if (!column_matrix.IsInitialized()) {
|
||||
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
|
||||
} else {
|
||||
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
|
||||
}
|
||||
|
||||
const size_t n_left = child_nodes_sizes.first;
|
||||
const size_t n_right = child_nodes_sizes.second;
|
||||
|
||||
SetNLeftElems(node_in_set, range.begin(), n_left);
|
||||
SetNRightElems(node_in_set, range.begin(), n_right);
|
||||
}
|
||||
|
||||
// allocate thread local memory, should be called for each specific task
|
||||
void AllocateForTask(size_t id) {
|
||||
if (mem_blocks_[id].get() == nullptr) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2020-2022 by XGBoost Contributors
|
||||
/**
|
||||
* Copyright 2020-2023 by XGBoost Contributors
|
||||
*/
|
||||
#include <thrust/binary_search.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
@@ -109,7 +109,7 @@ void PruneImpl(common::Span<SketchContainer::OffsetT const> cuts_ptr,
|
||||
template <typename T, typename U>
|
||||
void CopyTo(Span<T> out, Span<U> src) {
|
||||
CHECK_EQ(out.size(), src.size());
|
||||
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value, "");
|
||||
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);
|
||||
dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
|
||||
out.size_bytes(),
|
||||
cudaMemcpyDefault));
|
||||
@@ -143,7 +143,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
|
||||
thrust::make_zip_iterator(thrust::make_tuple(b_ind_iter, place_holder));
|
||||
|
||||
dh::XGBCachingDeviceAllocator<Tuple> alloc;
|
||||
static_assert(sizeof(Tuple) == sizeof(SketchEntry), "");
|
||||
static_assert(sizeof(Tuple) == sizeof(SketchEntry));
|
||||
// We reuse the memory for storing merge path.
|
||||
common::Span<Tuple> merge_path{reinterpret_cast<Tuple *>(out.data()), out.size()};
|
||||
// Determine the merge path, 0 if element is from x, 1 if it's from y.
|
||||
|
||||
@@ -24,8 +24,9 @@ std::shared_ptr<HostDeviceVector<bst_feature_t>> ColumnSampler::ColSample(
|
||||
for (size_t i = 0; i < h_features.size(); ++i) {
|
||||
weights[i] = feature_weights_[h_features[i]];
|
||||
}
|
||||
CHECK(ctx_);
|
||||
new_features.HostVector() =
|
||||
WeightedSamplingWithoutReplacement(p_features->HostVector(), weights, n);
|
||||
WeightedSamplingWithoutReplacement(ctx_, p_features->HostVector(), weights, n);
|
||||
} else {
|
||||
new_features.Resize(features.size());
|
||||
std::copy(features.begin(), features.end(), new_features.HostVector().begin());
|
||||
|
||||
@@ -20,7 +20,9 @@
|
||||
#include <vector>
|
||||
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "algorithm.h" // ArgSort
|
||||
#include "common.h"
|
||||
#include "xgboost/context.h" // Context
|
||||
#include "xgboost/host_device_vector.h"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -87,8 +89,8 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
|
||||
* https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/
|
||||
*/
|
||||
template <typename T>
|
||||
std::vector<T> WeightedSamplingWithoutReplacement(
|
||||
std::vector<T> const &array, std::vector<float> const &weights, size_t n) {
|
||||
std::vector<T> WeightedSamplingWithoutReplacement(Context const* ctx, std::vector<T> const& array,
|
||||
std::vector<float> const& weights, size_t n) {
|
||||
// ES sampling.
|
||||
CHECK_EQ(array.size(), weights.size());
|
||||
std::vector<float> keys(weights.size());
|
||||
@@ -100,7 +102,7 @@ std::vector<T> WeightedSamplingWithoutReplacement(
|
||||
auto k = std::log(u) / w;
|
||||
keys[i] = k;
|
||||
}
|
||||
auto ind = ArgSort<size_t>(Span<float>{keys}, std::greater<>{});
|
||||
auto ind = ArgSort<std::size_t>(ctx, keys.data(), keys.data() + keys.size(), std::greater<>{});
|
||||
ind.resize(n);
|
||||
|
||||
std::vector<T> results(ind.size());
|
||||
@@ -126,6 +128,7 @@ class ColumnSampler {
|
||||
float colsample_bytree_{1.0f};
|
||||
float colsample_bynode_{1.0f};
|
||||
GlobalRandomEngine rng_;
|
||||
Context const* ctx_;
|
||||
|
||||
public:
|
||||
std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
|
||||
@@ -157,12 +160,13 @@ class ColumnSampler {
|
||||
* \param colsample_bytree
|
||||
* \param skip_index_0 (Optional) True to skip index 0.
|
||||
*/
|
||||
void Init(int64_t num_col, std::vector<float> feature_weights, float colsample_bynode,
|
||||
float colsample_bylevel, float colsample_bytree) {
|
||||
void Init(Context const* ctx, int64_t num_col, std::vector<float> feature_weights,
|
||||
float colsample_bynode, float colsample_bylevel, float colsample_bytree) {
|
||||
feature_weights_ = std::move(feature_weights);
|
||||
colsample_bylevel_ = colsample_bylevel;
|
||||
colsample_bytree_ = colsample_bytree;
|
||||
colsample_bynode_ = colsample_bynode;
|
||||
ctx_ = ctx;
|
||||
|
||||
if (feature_set_tree_ == nullptr) {
|
||||
feature_set_tree_ = std::make_shared<HostDeviceVector<bst_feature_t>>();
|
||||
|
||||
@@ -77,14 +77,14 @@ class RowSetCollection {
|
||||
if (row_indices_.empty()) { // edge case: empty instance set
|
||||
constexpr size_t* kBegin = nullptr;
|
||||
constexpr size_t* kEnd = nullptr;
|
||||
static_assert(kEnd - kBegin == 0, "");
|
||||
elem_of_each_node_.emplace_back(Elem(kBegin, kEnd, 0));
|
||||
static_assert(kEnd - kBegin == 0);
|
||||
elem_of_each_node_.emplace_back(kBegin, kEnd, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t* begin = dmlc::BeginPtr(row_indices_);
|
||||
const size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
|
||||
elem_of_each_node_.emplace_back(Elem(begin, end, 0));
|
||||
elem_of_each_node_.emplace_back(begin, end, 0);
|
||||
}
|
||||
|
||||
std::vector<size_t>* Data() { return &row_indices_; }
|
||||
|
||||
@@ -35,11 +35,11 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
|
||||
auto iter = linalg::cbegin(ti_v);
|
||||
float q{0};
|
||||
if (opt_weights.Empty()) {
|
||||
q = common::Quantile(0.5, iter, iter + ti_v.Size());
|
||||
q = common::Quantile(ctx, 0.5, iter, iter + ti_v.Size());
|
||||
} else {
|
||||
CHECK_NE(t_v.Shape(1), 0);
|
||||
auto w_it = common::MakeIndexTransformIter([&](std::size_t i) { return opt_weights[i]; });
|
||||
q = common::WeightedQuantile(0.5, iter, iter + ti_v.Size(), w_it);
|
||||
q = common::WeightedQuantile(ctx, 0.5, iter, iter + ti_v.Size(), w_it);
|
||||
}
|
||||
h_out(i) = q;
|
||||
}
|
||||
|
||||
@@ -4,46 +4,52 @@
|
||||
#ifndef XGBOOST_COMMON_STATS_H_
|
||||
#define XGBOOST_COMMON_STATS_H_
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <iterator> // for distance
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "algorithm.h" // for StableSort
|
||||
#include "common.h" // AssertGPUSupport, OptionalWeights
|
||||
#include "optional_weight.h" // OptionalWeights
|
||||
#include "transform_iterator.h" // MakeIndexTransformIter
|
||||
#include "xgboost/context.h" // Context
|
||||
#include "xgboost/linalg.h"
|
||||
#include "xgboost/logging.h" // CHECK_GE
|
||||
#include "xgboost/linalg.h" // TensorView,VectorView
|
||||
#include "xgboost/logging.h" // CHECK_GE
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
/**
|
||||
* \brief Percentile with masked array using linear interpolation.
|
||||
* @brief Quantile using linear interpolation.
|
||||
*
|
||||
* https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
|
||||
*
|
||||
* \param alpha Percentile, must be in range [0, 1].
|
||||
* \param alpha Quantile, must be in range [0, 1].
|
||||
* \param begin Iterator begin for input array.
|
||||
* \param end Iterator end for input array.
|
||||
*
|
||||
* \return The result of interpolation.
|
||||
*/
|
||||
template <typename Iter>
|
||||
float Quantile(double alpha, Iter const& begin, Iter const& end) {
|
||||
float Quantile(Context const* ctx, double alpha, Iter const& begin, Iter const& end) {
|
||||
CHECK(alpha >= 0 && alpha <= 1);
|
||||
auto n = static_cast<double>(std::distance(begin, end));
|
||||
if (n == 0) {
|
||||
return std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
|
||||
std::vector<size_t> sorted_idx(n);
|
||||
std::vector<std::size_t> sorted_idx(n);
|
||||
std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
|
||||
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
|
||||
[&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
|
||||
if (omp_in_parallel()) {
|
||||
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
|
||||
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
|
||||
} else {
|
||||
StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
|
||||
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
|
||||
}
|
||||
|
||||
auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
|
||||
static_assert(std::is_same<decltype(val(0)), float>::value, "");
|
||||
static_assert(std::is_same<decltype(val(0)), float>::value);
|
||||
|
||||
if (alpha <= (1 / (n + 1))) {
|
||||
return val(0);
|
||||
@@ -51,7 +57,7 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
|
||||
if (alpha >= (n / (n + 1))) {
|
||||
return val(sorted_idx.size() - 1);
|
||||
}
|
||||
assert(n != 0 && "The number of rows in a leaf can not be zero.");
|
||||
|
||||
double x = alpha * static_cast<double>((n + 1));
|
||||
double k = std::floor(x) - 1;
|
||||
CHECK_GE(k, 0);
|
||||
@@ -66,30 +72,35 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
|
||||
* \brief Calculate the weighted quantile with step function. Unlike the unweighted
|
||||
* version, no interpolation is used.
|
||||
*
|
||||
* See https://aakinshin.net/posts/weighted-quantiles/ for some discussion on computing
|
||||
* See https://aakinshin.net/posts/weighted-quantiles/ for some discussions on computing
|
||||
* weighted quantile with interpolation.
|
||||
*/
|
||||
template <typename Iter, typename WeightIter>
|
||||
float WeightedQuantile(double alpha, Iter begin, Iter end, WeightIter weights) {
|
||||
float WeightedQuantile(Context const* ctx, double alpha, Iter begin, Iter end, WeightIter w_begin) {
|
||||
auto n = static_cast<double>(std::distance(begin, end));
|
||||
if (n == 0) {
|
||||
return std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
std::vector<size_t> sorted_idx(n);
|
||||
std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
|
||||
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
|
||||
[&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
|
||||
if (omp_in_parallel()) {
|
||||
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
|
||||
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
|
||||
} else {
|
||||
StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
|
||||
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
|
||||
}
|
||||
|
||||
auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
|
||||
|
||||
std::vector<float> weight_cdf(n); // S_n
|
||||
// weighted cdf is sorted during construction
|
||||
weight_cdf[0] = *(weights + sorted_idx[0]);
|
||||
weight_cdf[0] = *(w_begin + sorted_idx[0]);
|
||||
for (size_t i = 1; i < n; ++i) {
|
||||
weight_cdf[i] = weight_cdf[i - 1] + *(weights + sorted_idx[i]);
|
||||
weight_cdf[i] = weight_cdf[i - 1] + w_begin[sorted_idx[i]];
|
||||
}
|
||||
float thresh = weight_cdf.back() * alpha;
|
||||
size_t idx =
|
||||
std::size_t idx =
|
||||
std::lower_bound(weight_cdf.cbegin(), weight_cdf.cend(), thresh) - weight_cdf.cbegin();
|
||||
idx = std::min(idx, static_cast<size_t>(n - 1));
|
||||
return val(idx);
|
||||
|
||||
Reference in New Issue
Block a user