sync Jun 1

This commit is contained in:
Your Name
2023-06-01 15:55:06 -07:00
76 changed files with 1424 additions and 595 deletions

View File

@@ -3,6 +3,7 @@
*/
#pragma once
#include <string>
#include <vector>
#include "communicator.h"
@@ -224,5 +225,46 @@ inline void Allreduce(double *send_receive_buffer, size_t count) {
Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
}
/**
 * @brief Value returned by AllgatherV: the gathered data together with the layout
 *        needed to locate each individual input inside it.
 *
 * The members are filled positionally by AllgatherV via aggregate initialization, so
 * their declaration order must not change.
 */
template <typename T>
struct AllgatherVResult {
// Start position of every input inside `result`; the exclusive prefix sum of `sizes`.
std::vector<std::size_t> offsets;
// Size of every input of every worker; worker `r` owns the entries starting at
// `r * (number of inputs per worker)`.
std::vector<std::size_t> sizes;
// All inputs from all workers, stored back to back.
std::vector<T> result;
};
/**
 * @brief Gathers variable-length data from all processes and distributes it to all processes.
 *
 * We assume each worker has the same number of inputs, but each input may be of a different size.
 *
 * The data itself is exchanged with a max-reduction over a value-initialized buffer in which
 * every worker fills only its own segment. NOTE(review): this presumably only round-trips
 * values that do not compare less than `T{}` (e.g. unsigned integers); confirm before using
 * it with signed or floating point payloads that may be negative.
 *
 * @param inputs All the inputs from the local worker, stored back to back.
 * @param sizes  Sizes of each input.
 *
 * @return The offsets and sizes of every input from every worker, plus the gathered data.
 *         All three vectors are empty when `sizes` is empty.
 */
template <typename T>
inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
                                      std::vector<std::size_t> const &sizes) {
  auto const num_inputs = sizes.size();
  // Index of the first local input within the global (all workers) layout.
  auto const local_begin = num_inputs * GetRank();

  // Build the result in place so that nothing needs to be copied on return.
  AllgatherVResult<T> out;

  // Gather the sizes across all workers.
  out.sizes.resize(num_inputs * GetWorldSize());
  std::copy_n(sizes.cbegin(), num_inputs, out.sizes.begin() + local_begin);
  collective::Allgather(out.sizes.data(), out.sizes.size() * sizeof(std::size_t));

  // Calculate input offsets (std::exclusive_scan).
  out.offsets.resize(out.sizes.size());
  for (std::size_t i = 1; i < out.offsets.size(); i++) {
    out.offsets[i] = out.offsets[i - 1] + out.sizes[i - 1];
  }

  // With no inputs at all the vectors are empty; calling back() or indexing them would be
  // undefined behaviour, so fall back to an empty payload in that case.
  std::size_t total_input_size = 0;
  std::size_t local_offset = 0;
  if (!out.offsets.empty()) {
    total_input_size = out.offsets.back() + out.sizes.back();
    local_offset = out.offsets[local_begin];
  }

  // Gather all the inputs.
  out.result.resize(total_input_size);
  std::copy_n(inputs.cbegin(), inputs.size(), out.result.begin() + local_offset);
  // We cannot use allgather here, since each worker might have a different size.
  // The collective is still issued for an empty payload so that every worker takes part
  // in the same sequence of collective calls.
  Allreduce<Operation::kMax>(out.result.data(), out.result.size());

  return out;
}
} // namespace collective
} // namespace xgboost

View File

@@ -12,19 +12,22 @@
namespace xgboost {
namespace collective {
thread_local int Communicator::device_ordinal_{-1};
thread_local std::unique_ptr<DeviceCommunicator> Communicator::device_communicator_{};
void Communicator::Finalize() {
communicator_->Shutdown();
communicator_.reset(new NoOpCommunicator());
device_ordinal_ = -1;
device_communicator_.reset(nullptr);
}
DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
if (!device_communicator_ || device_ordinal_ != device_ordinal) {
device_ordinal_ = device_ordinal;
thread_local auto old_device_ordinal = -1;
// If the number of GPUs changes, we need to re-initialize NCCL.
thread_local auto old_world_size = -1;
if (!device_communicator_ || device_ordinal != old_device_ordinal ||
communicator_->GetWorldSize() != old_world_size) {
old_device_ordinal = device_ordinal;
old_world_size = communicator_->GetWorldSize();
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
if (type_ != CommunicatorType::kFederated) {
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get()));

View File

@@ -229,7 +229,6 @@ class Communicator {
static thread_local std::unique_ptr<Communicator> communicator_;
static thread_local CommunicatorType type_;
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
static thread_local int device_ordinal_;
static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
#endif

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2022 by XGBoost Contributors
* Copyright 2022-2023, XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_CUDA_CONTEXT_CUH_
#define XGBOOST_COMMON_CUDA_CONTEXT_CUH_
@@ -16,21 +16,39 @@ struct CUDAContext {
/**
* \brief Caching thrust policy.
*/
#if defined(XGBOOST_USE_HIP)
auto CTP() const { return thrust::hip::par(caching_alloc_).on(dh::DefaultStream()); }
auto CTP() const {
#if defined(XGBOOST_USE_CUDA)
#if THRUST_MAJOR_VERSION >= 2
return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
#else
auto CTP() const { return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream()); }
return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
#endif // THRUST_MAJOR_VERSION >= 2
#elif defined(XGBOOST_USE_HIP)
#if THRUST_MAJOR_VERSION >= 2
return thrust::hip::par_nosync(caching_alloc_).on(dh::DefaultStream());
#else
return thrust::hip::par(caching_alloc_).on(dh::DefaultStream());
#endif // THRUST_MAJOR_VERSION >= 2
#endif
}
/**
* \brief Thrust policy without caching allocator.
*/
#if defined(XGBOOST_USE_HIP)
auto TP() const { return thrust::hip::par(alloc_).on(dh::DefaultStream()); }
auto TP() const {
#if defined(XGBOOST_USE_CUDA)
#if THRUST_MAJOR_VERSION >= 2
return thrust::cuda::par_nosync(alloc_).on(dh::DefaultStream());
#else
auto TP() const { return thrust::cuda::par(alloc_).on(dh::DefaultStream()); }
return thrust::cuda::par(alloc_).on(dh::DefaultStream());
#endif // THRUST_MAJOR_VERSION >= 2
#elif defined(XGBOOST_USE_HIP)
#if THRUST_MAJOR_VERSION >= 2
return thrust::hip::par_nosync(alloc_).on(dh::DefaultStream());
#else
return thrust::hip::par(alloc_).on(dh::DefaultStream());
#endif // THRUST_MAJOR_VERSION >= 2
#endif
}
auto Stream() const { return dh::DefaultStream(); }
};
} // namespace xgboost

View File

@@ -227,9 +227,8 @@ void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page,
return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size.
});
detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
batch_it, dummy_is_valid,
0, sorted_entries.size(),
&cuts_ptr, &column_sizes_scan);
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
@@ -296,9 +295,8 @@ void ProcessWeightedBatch(int device, const SparsePage& page,
return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size.
});
detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
batch_it, dummy_is_valid,
0, sorted_entries.size(),
&cuts_ptr, &column_sizes_scan);
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,

View File

@@ -17,6 +17,10 @@
#include "quantile.cuh"
#include "timer.h"
#if defined(XGBOOST_USE_HIP)
namespace cub = hipcub;
#endif
namespace xgboost {
namespace common {
namespace cuda {
@@ -53,24 +57,128 @@ struct EntryCompareOp {
};
// Get column size from adapter batch and for output cuts.
template <typename Iter>
void GetColumnSizesScan(int device, size_t num_columns, size_t num_cuts_per_feature,
Iter batch_iter, data::IsValidFunctor is_valid,
size_t begin, size_t end,
HostDeviceVector<SketchContainer::OffsetT> *cuts_ptr,
/**
 * \brief Kernel that counts the valid entries of each column.
 *
 * Each block first accumulates its counts in a shared-memory buffer and afterwards adds
 * them to the global output with atomics.
 *
 * \tparam kBlockThreads Block size template parameter; it is not referenced in the body.
 * \tparam CounterT      Type of the per-column counters kept in shared memory.
 * \tparam BatchIt       Iterator type wrapped by the input span.
 *
 * \param batch_iter      Entries to count; each entry exposes `column_idx`.
 * \param is_valid        Predicate; only entries for which it returns true are counted.
 * \param out_column_size Per-column counts. The kernel only adds to it, so the caller is
 *                        responsible for zeroing it, and the dynamic shared memory passed
 *                        at launch must hold `out_column_size.size()` values of CounterT.
 */
template <std::uint32_t kBlockThreads, typename CounterT, typename BatchIt>
__global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
data::IsValidFunctor is_valid,
Span<std::size_t> out_column_size) {
// Dynamically sized shared memory, viewed as one CounterT per output column.
extern __shared__ char smem[];
auto smem_cs_ptr = reinterpret_cast<CounterT*>(smem);
// Reset the shared counters, then synchronize before any thread starts counting.
dh::BlockFill(smem_cs_ptr, out_column_size.size(), 0);
cub::CTA_SYNC();
auto n = batch_iter.size();
for (auto idx : dh::GridStrideRange(static_cast<std::size_t>(0), n)) {
auto e = batch_iter[idx];
if (is_valid(e)) {
atomicAdd(&smem_cs_ptr[e.column_idx], static_cast<CounterT>(1));
}
}
// All shared-memory increments of this block must be done before they are read back.
cub::CTA_SYNC();
auto out_global_ptr = out_column_size;
// Fold the block-local counts into the global output, widening to std::size_t.
for (auto i : dh::BlockStrideRange(static_cast<std::size_t>(0), out_column_size.size())) {
atomicAdd(&out_global_ptr[i], static_cast<std::size_t>(smem_cs_ptr[i]));
}
}
/**
 * \brief Estimate a grid size for a kernel as (number of multiprocessors on the device) x
 *        (maximum number of active blocks per multiprocessor for this kernel).
 *
 * \tparam kBlockThreads Number of threads per block used for the occupancy query.
 * \tparam Kernel        Type of the kernel function.
 *
 * \param device     Device ordinal to query.
 * \param kernel     Kernel whose occupancy is queried.
 * \param shared_mem Dynamic shared memory per block, in bytes.
 *
 * \return The product of both queried values; 0 when neither the CUDA nor the HIP backend
 *         is enabled, as no query is performed in that case.
 */
template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
  int sm_count = 0;
  int blocks_per_sm = 0;
#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device));
  dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, kernel,
                                                              kBlockThreads, shared_mem));
#elif defined(XGBOOST_USE_HIP)
  dh::safe_cuda(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, device));
  dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, kernel,
                                                             kBlockThreads, shared_mem));
#endif
  return static_cast<std::uint32_t>(blocks_per_sm * sm_count);
}
/**
* \brief Get the size of each column. This is a histogram with additional handling of
* invalid values.
*
* Counting is done in shared memory when one counter per column fits into the available
* shared memory, and with global atomics otherwise.
*
* \tparam BatchIt Type of input adapter batch.
* \tparam force_use_global_memory Used for testing. Force global atomic add.
* \tparam force_use_u64 Used for testing. Force u64 as counter in shared memory.
*
* \param device CUDA device ordinal.
* \param batch_iter Iterator for input data from adapter batch.
* \param is_valid Whether an element is valid; only valid elements are counted.
* \param out_column_size Output buffer for the size of each column. It is zeroed here
* before counting.
*/
template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter,
data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
// Both code paths below only add to the output, so start from zero.
thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
std::size_t max_shared_memory = dh::MaxSharedMemory(device);
// Not strictly correct as we should use number of samples to determine the type of
// counter. However, the sample size is not known due to sliding window on number of
// elements.
std::size_t n = batch_iter.size();
std::size_t required_shared_memory = 0;
bool use_u32{false};
// A 32-bit counter cannot overflow when the total number of elements fits in 32 bits,
// and it halves the amount of shared memory needed per column.
if (!force_use_u64 && n < static_cast<std::size_t>(std::numeric_limits<std::uint32_t>::max())) {
required_shared_memory = out_column_size.size() * sizeof(std::uint32_t);
use_u32 = true;
} else {
required_shared_memory = out_column_size.size() * sizeof(std::size_t);
use_u32 = false;
}
// A requirement of zero bytes (no columns) is routed to the global memory path.
bool use_shared = required_shared_memory <= max_shared_memory && required_shared_memory != 0;
if (!force_use_global_memory && use_shared) {
CHECK_NE(required_shared_memory, 0);
std::uint32_t constexpr kBlockThreads = 512;
// The two branches differ only in the counter type the kernel is instantiated with.
if (use_u32) {
CHECK(!force_use_u64);
auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::uint32_t, BatchIt>;
auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}(
kernel, batch_iter, is_valid, out_column_size);
} else {
auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::size_t, BatchIt>;
auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}(
kernel, batch_iter, is_valid, out_column_size);
}
} else {
// Fallback: one atomic add on the global output per valid element.
auto d_out_column_size = out_column_size;
dh::LaunchN(batch_iter.size(), [=] __device__(size_t idx) {
auto e = batch_iter[idx];
if (is_valid(e)) {
atomicAdd(&d_out_column_size[e.column_idx], static_cast<size_t>(1));
}
});
}
}
template <typename BatchIt>
void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature,
IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
dh::caching_device_vector<size_t>* column_sizes_scan) {
column_sizes_scan->resize(num_columns + 1, 0);
column_sizes_scan->resize(num_columns + 1);
cuts_ptr->SetDevice(device);
cuts_ptr->Resize(num_columns + 1, 0);
dh::XGBCachingDeviceAllocator<char> alloc;
auto d_column_sizes_scan = column_sizes_scan->data().get();
dh::LaunchN(end - begin, [=] __device__(size_t idx) {
auto e = batch_iter[begin + idx];
if (is_valid(e)) {
atomicAdd(&d_column_sizes_scan[e.column_idx], static_cast<size_t>(1));
}
});
auto d_column_sizes_scan = dh::ToSpan(*column_sizes_scan);
LaunchGetColumnSizeKernel(device, batch_iter, is_valid, d_column_sizes_scan);
// Calculate cuts CSC pointer
auto cut_ptr_it = dh::MakeTransformIterator<size_t>(
column_sizes_scan->begin(), [=] __device__(size_t column_size) {
@@ -85,8 +193,7 @@ void GetColumnSizesScan(int device, size_t num_columns, size_t num_cuts_per_feat
column_sizes_scan->end(), column_sizes_scan->begin());
#elif defined(XGBOOST_USE_CUDA)
thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
cut_ptr_it + column_sizes_scan->size(),
cuts_ptr->DevicePointer());
cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
column_sizes_scan->end(), column_sizes_scan->begin());
#endif
@@ -130,29 +237,26 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
// Count the valid entries in each column and copy them out.
template <typename AdapterBatch, typename BatchIter>
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,
Range1d range, float missing,
size_t columns, size_t cuts_per_feature, int device,
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
float missing, size_t columns, size_t cuts_per_feature, int device,
HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
dh::caching_device_vector<size_t>* column_sizes_scan,
dh::device_vector<Entry>* sorted_entries) {
auto entry_iter = dh::MakeTransformIterator<Entry>(
thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) {
return Entry(batch.GetElement(idx).column_idx,
batch.GetElement(idx).value);
return Entry(batch.GetElement(idx).column_idx, batch.GetElement(idx).value);
});
auto n = range.end() - range.begin();
auto span = IterSpan{batch_iter + range.begin(), n};
data::IsValidFunctor is_valid(missing);
// Work out how many valid entries we have in each column
GetColumnSizesScan(device, columns, cuts_per_feature,
batch_iter, is_valid,
range.begin(), range.end(),
cut_sizes_scan,
GetColumnSizesScan(device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan,
column_sizes_scan);
size_t num_valid = column_sizes_scan->back();
// Copy current subset of valid elements into temporary storage and sort
sorted_entries->resize(num_valid);
dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(),
sorted_entries->begin(), is_valid);
dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(), sorted_entries->begin(),
is_valid);
}
void SortByWeight(dh::device_vector<float>* weights,

View File

@@ -209,7 +209,7 @@ class PartitionBuilder {
BitVector* decision_bits, BitVector* missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
std::size_t nid = nodes[node_in_set].nid;
bst_feature_t fid = tree[nid].SplitIndex();
bst_feature_t fid = tree.SplitIndex(nid);
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& cut_values = gmat.cut.Values();
@@ -263,14 +263,13 @@ class PartitionBuilder {
template <typename ExpandEntry>
void PartitionByMask(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const RegTree& tree,
const size_t* rid, BitVector const& decision_bits,
const RegTree& tree, const size_t* rid, BitVector const& decision_bits,
BitVector const& missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
std::size_t nid = nodes[node_in_set].nid;
bool default_left = tree[nid].DefaultLeft();
bool default_left = tree.DefaultLeft(nid);
auto pred = [&](auto ridx) {
bool go_left = default_left;

View File

@@ -7,7 +7,6 @@
#include <utility>
#include "../collective/aggregator.h"
#include "../collective/communicator-inl.h"
#include "../data/adapter.h"
#include "categorical.h"
#include "hist_util.h"
@@ -143,6 +142,7 @@ struct QuantileAllreduce {
template <typename WQSketch>
void SketchContainerImpl<WQSketch>::GatherSketchInfo(
MetaInfo const& info,
std::vector<typename WQSketch::SummaryContainer> const &reduced,
std::vector<size_t> *p_worker_segments, std::vector<bst_row_t> *p_sketches_scan,
std::vector<typename WQSketch::Entry> *p_global_sketches) {
@@ -168,7 +168,7 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), sketches_scan.begin() + beg_scan + 1);
// Gather all column pointers
collective::Allreduce<collective::Operation::kSum>(sketches_scan.data(), sketches_scan.size());
collective::GlobalSum(info, sketches_scan.data(), sketches_scan.size());
for (int32_t i = 0; i < world; ++i) {
size_t back = (i + 1) * (n_columns + 1) - 1;
auto n_entries = sketches_scan.at(back);
@@ -196,7 +196,8 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
static_assert(sizeof(typename WQSketch::Entry) / 4 == sizeof(float),
"Unexpected size of sketch entry.");
collective::Allreduce<collective::Operation::kSum>(
collective::GlobalSum(
info,
reinterpret_cast<float *>(global_sketches.data()),
global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float));
}
@@ -222,8 +223,7 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
std::vector<size_t> global_feat_ptrs(feature_ptr.size() * world_size, 0);
size_t feat_begin = rank * feature_ptr.size(); // pointer to current worker
std::copy(feature_ptr.begin(), feature_ptr.end(), global_feat_ptrs.begin() + feat_begin);
collective::Allreduce<collective::Operation::kSum>(global_feat_ptrs.data(),
global_feat_ptrs.size());
collective::GlobalSum(info, global_feat_ptrs.data(), global_feat_ptrs.size());
// move all categories into a flatten vector to prepare for allreduce
size_t total = feature_ptr.back();
@@ -236,8 +236,7 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
// indptr for indexing workers
std::vector<size_t> global_worker_ptr(world_size + 1, 0);
global_worker_ptr[rank + 1] = total; // shift 1 to right for constructing the indptr
collective::Allreduce<collective::Operation::kSum>(global_worker_ptr.data(),
global_worker_ptr.size());
collective::GlobalSum(info, global_worker_ptr.data(), global_worker_ptr.size());
std::partial_sum(global_worker_ptr.cbegin(), global_worker_ptr.cend(), global_worker_ptr.begin());
// total number of categories in all workers with all features
auto gtotal = global_worker_ptr.back();
@@ -249,8 +248,7 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
CHECK_EQ(rank_size, total);
std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin);
// gather values from all workers.
collective::Allreduce<collective::Operation::kSum>(global_categories.data(),
global_categories.size());
collective::GlobalSum(info, global_categories.data(), global_categories.size());
QuantileAllreduce<float> allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs,
categories_.size()};
ParallelFor(categories_.size(), n_threads_, [&](auto fidx) {
@@ -323,7 +321,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
std::vector<bst_row_t> sketches_scan((n_columns + 1) * world, 0);
std::vector<typename WQSketch::Entry> global_sketches;
this->GatherSketchInfo(reduced, &worker_segments, &sketches_scan, &global_sketches);
this->GatherSketchInfo(info, reduced, &worker_segments, &sketches_scan, &global_sketches);
std::vector<typename WQSketch::SummaryContainer> final_sketches(n_columns);
@@ -371,7 +369,9 @@ auto AddCategories(std::set<float> const &categories, HistogramCuts *cuts) {
InvalidCategory();
}
auto &cut_values = cuts->cut_values_.HostVector();
auto max_cat = *std::max_element(categories.cbegin(), categories.cend());
// With column-wise data split, the categories may be empty.
auto max_cat =
categories.empty() ? 0.0f : *std::max_element(categories.cbegin(), categories.cend());
CheckMaxCat(max_cat, categories.size());
for (bst_cat_t i = 0; i <= AsCat(max_cat); ++i) {
cut_values.push_back(i);

View File

@@ -822,7 +822,8 @@ class SketchContainerImpl {
return group_ind;
}
// Gather sketches from all workers.
void GatherSketchInfo(std::vector<typename WQSketch::SummaryContainer> const &reduced,
void GatherSketchInfo(MetaInfo const& info,
std::vector<typename WQSketch::SummaryContainer> const &reduced,
std::vector<bst_row_t> *p_worker_segments,
std::vector<bst_row_t> *p_sketches_scan,
std::vector<typename WQSketch::Entry> *p_global_sketches);

View File

@@ -26,6 +26,12 @@
#include "xgboost/logging.h"
#include "xgboost/span.h"
#if defined(XGBOOST_USE_CUDA)
#include "cuda_fp16.h"
#elif defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_fp16.h>
#endif
namespace xgboost {
// Common errors in parsing columnar format.
struct ArrayInterfaceErrors {
@@ -304,12 +310,12 @@ class ArrayInterfaceHandler {
template <typename T, typename E = void>
struct ToDType;
// float
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
template <>
struct ToDType<__half> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2;
};
#endif // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
template <>
struct ToDType<float> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4;
@@ -459,11 +465,11 @@ class ArrayInterface {
CHECK(sizeof(long double) == 16) << error::NoF128();
type = T::kF16;
} else if (typestr[1] == 'f' && typestr[2] == '2') {
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
type = T::kF2;
#else
LOG(FATAL) << "Half type is not supported.";
#endif // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
} else if (typestr[1] == 'f' && typestr[2] == '4') {
type = T::kF4;
} else if (typestr[1] == 'f' && typestr[2] == '8') {
@@ -490,20 +496,17 @@ class ArrayInterface {
}
}
XGBOOST_DEVICE size_t Shape(size_t i) const { return shape[i]; }
XGBOOST_DEVICE size_t Stride(size_t i) const { return strides[i]; }
[[nodiscard]] XGBOOST_DEVICE std::size_t Shape(size_t i) const { return shape[i]; }
[[nodiscard]] XGBOOST_DEVICE std::size_t Stride(size_t i) const { return strides[i]; }
template <typename Fn>
XGBOOST_HOST_DEV_INLINE decltype(auto) DispatchCall(Fn func) const {
using T = ArrayInterfaceHandler::Type;
switch (type) {
case T::kF2: {
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
return func(reinterpret_cast<__half const *>(data));
#else
SPAN_CHECK(false);
return func(reinterpret_cast<float const *>(data));
#endif // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
}
case T::kF4:
return func(reinterpret_cast<float const *>(data));
@@ -540,23 +543,23 @@ class ArrayInterface {
return func(reinterpret_cast<uint64_t const *>(data));
}
XGBOOST_DEVICE std::size_t ElementSize() const {
[[nodiscard]] XGBOOST_DEVICE std::size_t ElementSize() const {
return this->DispatchCall([](auto *typed_data_ptr) {
return sizeof(std::remove_pointer_t<decltype(typed_data_ptr)>);
});
}
XGBOOST_DEVICE std::size_t ElementAlignment() const {
[[nodiscard]] XGBOOST_DEVICE std::size_t ElementAlignment() const {
return this->DispatchCall([](auto *typed_data_ptr) {
return std::alignment_of<std::remove_pointer_t<decltype(typed_data_ptr)>>::value;
});
}
template <typename T = float, typename... Index>
XGBOOST_DEVICE T operator()(Index &&...index) const {
XGBOOST_HOST_DEV_INLINE T operator()(Index &&...index) const {
static_assert(sizeof...(index) <= D, "Invalid index.");
return this->DispatchCall([=](auto const *p_values) -> T {
std::size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...);
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
// No operator defined for half -> size_t
using Type = std::conditional_t<
std::is_same<__half,
@@ -566,7 +569,7 @@ class ArrayInterface {
return static_cast<T>(static_cast<Type>(p_values[offset]));
#else
return static_cast<T>(p_values[offset]);
#endif
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
});
}
@@ -603,7 +606,7 @@ void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
};
switch (array.type) {
case ArrayInterfaceHandler::kF2: {
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
dispatch(__half{});
#endif
break;

View File

@@ -698,6 +698,9 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
this->feature_type_names = that.feature_type_names;
auto &h_feature_types = feature_types.HostVector();
LoadFeatureType(this->feature_type_names, &h_feature_types);
} else if (!that.feature_types.Empty()) {
this->feature_types.Resize(that.feature_types.Size());
this->feature_types.Copy(that.feature_types);
}
if (!that.feature_weights.Empty()) {
this->feature_weights.Resize(that.feature_weights.Size());

View File

@@ -29,7 +29,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
: columns_(columns),
num_rows_(num_rows) {}
size_t Size() const { return num_rows_ * columns_.size(); }
__device__ COOTuple GetElement(size_t idx) const {
__device__ __forceinline__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % columns_.size();
size_t row_idx = idx / columns_.size();
auto const& column = columns_[column_idx];
@@ -39,6 +39,14 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return {row_idx, column_idx, value};
}
/**
 * \brief Fetch a single value addressed by row and feature index.
 *
 * \param ridx Row index.
 * \param fidx Feature (column) index.
 *
 * \return The stored value, or quiet NaN when the column has a validity mask and the
 *         mask does not mark the row as valid.
 */
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
  auto const& col = columns_[fidx];
  // A column without a validity mask (null data pointer) has every row present.
  bool const is_present = col.valid.Data() == nullptr || col.valid.Check(ridx);
  if (!is_present) {
    return std::numeric_limits<float>::quiet_NaN();
  }
  return col(ridx);
}
XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
@@ -166,6 +174,10 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
float value = array_interface_(row_idx, column_idx);
return {row_idx, column_idx, value};
}
/**
 * \brief Fetch a single value addressed by row and feature index.
 *
 * \param ridx Row index.
 * \param fidx Feature (column) index.
 *
 * \return The value read from the array interface, as float.
 */
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
  return array_interface_(ridx, fidx);
}
XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
@@ -202,40 +214,64 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
// Returns maximum row length
template <typename AdapterBatchT>
size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
int device_idx, float missing) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#elif defined(XGBOOST_USE_CUDA)
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
float missing) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#endif
IsValidFunctor is_valid(missing);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#endif
auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();
// Use more than 1 threads for each row in case of dataset being too wide.
bst_feature_t stride{0};
if (n_features < 32) {
stride = std::min(n_features, 4u);
} else if (n_features < 64) {
stride = 8;
} else if (n_features < 128) {
stride = 16;
} else {
stride = 32;
}
// Count elements per row
dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
auto element = batch.GetElement(idx);
if (is_valid(element)) {
atomicAdd(reinterpret_cast<unsigned long long*>( // NOLINT
&offset[element.row_idx]),
static_cast<unsigned long long>(1)); // NOLINT
dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) {
bst_row_t cnt{0};
auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride);
SPAN_CHECK(ridx < n_samples);
for (bst_feature_t fidx = fbeg; fidx < n_features; fidx += stride) {
if (is_valid(batch.GetElement(ridx, fidx))) {
cnt++;
}
}
atomicAdd(reinterpret_cast<unsigned long long*>( // NOLINT
&offset[ridx]),
static_cast<unsigned long long>(cnt)); // NOLINT
});
dh::XGBCachingDeviceAllocator<char> alloc;
#if defined(XGBOOST_USE_HIP)
size_t row_stride =
dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<std::size_t>(0), thrust::maximum<size_t>());
#elif defined(XGBOOST_USE_CUDA)
size_t row_stride =
#if defined(XGBOOST_USE_CUDA)
bst_row_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<std::size_t>(0), thrust::maximum<size_t>());
static_cast<bst_row_t>(0), thrust::maximum<bst_row_t>());
#elif defined(XGBOOST_USE_HIP)
bst_row_t row_stride =
dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<bst_row_t>(0), thrust::maximum<bst_row_t>());
#endif
return row_stride;
}
@@ -243,13 +279,29 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
* \brief Check there's no inf in data.
*/
template <typename AdapterBatchT>
bool HasInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
bool NoInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
auto counting = thrust::make_counting_iterator(0llu);
auto value_iter = dh::MakeTransformIterator<float>(
counting, [=] XGBOOST_DEVICE(std::size_t idx) { return batch.GetElement(idx).value; });
auto valid =
thrust::none_of(value_iter, value_iter + batch.Size(),
[is_valid] XGBOOST_DEVICE(float v) { return is_valid(v) && std::isinf(v); });
auto value_iter = dh::MakeTransformIterator<bool>(counting, [=] XGBOOST_DEVICE(std::size_t idx) {
auto v = batch.GetElement(idx).value;
if (!is_valid(v)) {
// discard the invalid elements.
return true;
}
// check that there's no inf in data.
return !std::isinf(v);
});
dh::XGBCachingDeviceAllocator<char> alloc;
// The default implementation in thrust optimizes any_of/none_of/all_of by using small
// intervals to early stop. But we expect all data to be valid here, using small
// intervals only decreases performance due to excessive kernel launch and stream
// synchronization.
#if defined(XGBOOST_USE_CUDA)
auto valid = dh::Reduce(thrust::cuda::par(alloc), value_iter, value_iter + batch.Size(), true,
thrust::logical_and<>{});
#elif defined(XGBOOST_USE_HIP)
auto valid = dh::Reduce(thrust::hip::par(alloc), value_iter, value_iter + batch.Size(), true,
thrust::logical_and<>{});
#endif
return valid;
}
}; // namespace data

View File

@@ -213,7 +213,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
// correct output position
auto counting = thrust::make_counting_iterator(0llu);
data::IsValidFunctor is_valid(missing);
bool valid = data::HasInfInData(batch, is_valid);
bool valid = data::NoInfInData(batch, is_valid);
CHECK(valid) << error::InfInData();
auto key_iter = dh::MakeTransformIterator<size_t>(

View File

@@ -92,7 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
}
auto batch_rows = num_rows();
accumulated_rows += batch_rows;
dh::caching_device_vector<size_t> row_counts(batch_rows + 1, 0);
dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);
@@ -163,7 +163,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
#endif
auto rows = num_rows();
dh::caching_device_vector<size_t> row_counts(rows + 1, 0);
dh::device_vector<size_t> row_counts(rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);

View File

@@ -92,7 +92,7 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
template <typename AdapterBatchT>
size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
SparsePage* page) {
bool valid = HasInfInData(batch, IsValidFunctor{missing});
bool valid = NoInfInData(batch, IsValidFunctor{missing});
CHECK(valid) << error::InfInData();
page->offset.SetDevice(device);

View File

@@ -67,7 +67,7 @@ class ColumnSplitHelper {
const int32_t nid = nodes[node_in_set].nid;
const size_t task_id = partition_builder_->GetTaskIdx(node_in_set, begin);
partition_builder_->AllocateForTask(task_id);
partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, *p_tree,
(*row_set_collection_)[nid].begin, decision_bits_,
missing_bits_);
});

View File

@@ -25,7 +25,6 @@
#include "xgboost/linalg.h" // for Constants, Vector
namespace xgboost::tree {
template <typename ExpandEntry>
class HistEvaluator {
private:
struct NodeEntry {
@@ -285,10 +284,42 @@ class HistEvaluator {
return left_sum;
}
/**
 * @brief Gather the expand entries from all the workers.
 *
 * The exchange happens in two steps: the primitive fields of every entry are gathered with a
 * single fixed-size allgather, and the variable-length cat_bits are flattened into one local
 * buffer, gathered with AllgatherV, and then copied back into the gathered entries.
 *
 * @param entries Local expand entries on this worker.
 * @return Global expand entries gathered from all workers. The local entries occupy the slots
 *         starting at `entries.size() * rank`. Empty when `entries` is empty.
 */
std::vector<CPUExpandEntry> Allgather(std::vector<CPUExpandEntry> const &entries) {
  auto const num_entries = entries.size();
  if (num_entries == 0) {
    // Nothing to exchange. Without this guard AllgatherV would be asked for the last element of
    // empty offset/size vectors.
    // NOTE(review): this skips the collective calls, which is only safe if every worker passes
    // the same number of entries -- the buffer layout below already relies on that; confirm.
    return {};
  }
  auto const world = collective::GetWorldSize();
  auto const rank = collective::GetRank();

  // First, gather all the primitive fields.
  std::vector<CPUExpandEntry> all_entries(num_entries * world);
  // Flattened cat_bits of the local entries, plus the length contributed by each entry.
  std::vector<uint32_t> cat_bits;
  std::vector<std::size_t> cat_bits_sizes;
  for (std::size_t i = 0; i < num_entries; i++) {
    // Fill this worker's slot; the cat_bits go into the side buffers instead of the entry.
    all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
  }
  collective::Allgather(all_entries.data(), all_entries.size() * sizeof(CPUExpandEntry));

  // Gather all the cat_bits.
  auto gathered = collective::AllgatherV(cat_bits, cat_bits_sizes);

  common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) {
    // Copy the cat_bits back into all expand entries.
    all_entries[i].split.cat_bits.resize(gathered.sizes[i]);
    std::copy_n(gathered.result.cbegin() + gathered.offsets[i], gathered.sizes[i],
                all_entries[i].split.cat_bits.begin());
  });

  return all_entries;
}
public:
void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
common::Span<FeatureType const> feature_types, const RegTree &tree,
std::vector<ExpandEntry> *p_entries) {
std::vector<CPUExpandEntry> *p_entries) {
auto n_threads = ctx_->Threads();
auto& entries = *p_entries;
// All nodes are on the same level, so we can store the shared ptr.
@@ -306,7 +337,7 @@ class HistEvaluator {
return features[nidx_in_set]->Size();
}, grain_size);
std::vector<ExpandEntry> tloc_candidates(n_threads * entries.size());
std::vector<CPUExpandEntry> tloc_candidates(n_threads * entries.size());
for (size_t i = 0; i < entries.size(); ++i) {
for (decltype(n_threads) j = 0; j < n_threads; ++j) {
tloc_candidates[i * n_threads + j] = entries[i];
@@ -365,22 +396,18 @@ class HistEvaluator {
if (is_col_split_) {
// With column-wise data split, we gather the best splits from all the workers and update the
// expand entries accordingly.
auto const world = collective::GetWorldSize();
auto const rank = collective::GetRank();
auto const num_entries = entries.size();
std::vector<ExpandEntry> buffer{num_entries * world};
std::copy_n(entries.cbegin(), num_entries, buffer.begin() + num_entries * rank);
collective::Allgather(buffer.data(), buffer.size() * sizeof(ExpandEntry));
for (auto worker = 0; worker < world; ++worker) {
auto all_entries = Allgather(entries);
for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
entries[nidx_in_set].split.Update(buffer[worker * num_entries + nidx_in_set].split);
entries[nidx_in_set].split.Update(
all_entries[worker * entries.size() + nidx_in_set].split);
}
}
}
}
// Add splits to tree, handles all statistic
void ApplyTreeSplit(ExpandEntry const& candidate, RegTree *p_tree) {
void ApplyTreeSplit(CPUExpandEntry const& candidate, RegTree *p_tree) {
auto evaluator = tree_evaluator_.GetEvaluator();
RegTree &tree = *p_tree;
@@ -465,6 +492,7 @@ class HistMultiEvaluator {
FeatureInteractionConstraintHost interaction_constraints_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
Context const *ctx_;
bool is_col_split_{false};
private:
static double MultiCalcSplitGain(TrainParam const &param,
@@ -543,6 +571,57 @@ class HistMultiEvaluator {
return false;
}
/**
 * @brief Gather the expand entries from all the workers.
 *
 * Primitive fields are gathered with one fixed-size allgather. The variable-length cat_bits are
 * gathered with AllgatherV, and the gradient sums are flattened into a single buffer, gathered,
 * and split back into left_sum / right_sum for every gathered entry.
 *
 * @param entries Local expand entries on this worker.
 * @return Global expand entries gathered from all workers. The local entries occupy the slots
 *         starting at `entries.size() * rank`. Empty when `entries` is empty.
 */
std::vector<MultiExpandEntry> Allgather(std::vector<MultiExpandEntry> const &entries) {
  auto const num_entries = entries.size();
  if (num_entries == 0) {
    // Nothing to exchange. Without this guard the per-entry gradient count below would divide
    // by zero, and AllgatherV would be asked for the last element of empty vectors.
    // NOTE(review): this skips the collective calls, which is only safe if every worker passes
    // the same number of entries -- the buffer layout below already relies on that; confirm.
    return {};
  }
  auto const world = collective::GetWorldSize();
  auto const rank = collective::GetRank();

  // First, gather all the primitive fields.
  std::vector<MultiExpandEntry> all_entries(num_entries * world);
  // Side buffers filled by CopyAndCollect: flattened cat_bits, their per-entry lengths, and the
  // flattened gradient sums of the local entries.
  std::vector<uint32_t> cat_bits;
  std::vector<std::size_t> cat_bits_sizes;
  std::vector<GradientPairPrecise> gradients;
  for (std::size_t i = 0; i < num_entries; i++) {
    all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes,
                                                       &gradients);
  }
  collective::Allgather(all_entries.data(), all_entries.size() * sizeof(MultiExpandEntry));

  // Gather all the cat_bits.
  auto gathered_cat_bits = collective::AllgatherV(cat_bits, cat_bits_sizes);

  // Gather all the gradients.
  // NOTE(review): a fixed-size allgather is used here, so this presumably requires every worker
  // to contribute the same number of gradients -- confirm.
  auto const num_gradients = gradients.size();
  std::vector<GradientPairPrecise> all_gradients(num_gradients * world);
  std::copy_n(gradients.cbegin(), num_gradients, all_gradients.begin() + num_gradients * rank);
  collective::Allgather(all_gradients.data(), all_gradients.size() * sizeof(GradientPairPrecise));

  auto const total_entries = num_entries * world;
  // The flattened buffer is sliced evenly: each entry gets `gradients_per_entry` values, the
  // first half restored as left_sum and the second half as right_sum.
  // NOTE(review): assumes all entries carry equally sized left/right sums -- confirm.
  auto const gradients_per_entry = num_gradients / num_entries;
  auto const gradients_per_side = gradients_per_entry / 2;
  common::ParallelFor(total_entries, ctx_->Threads(), [&] (auto i) {
    // Copy the cat_bits back into all expand entries.
    all_entries[i].split.cat_bits.resize(gathered_cat_bits.sizes[i]);
    std::copy_n(gathered_cat_bits.result.cbegin() + gathered_cat_bits.offsets[i],
                gathered_cat_bits.sizes[i], all_entries[i].split.cat_bits.begin());

    // Copy the gradients back into all expand entries.
    all_entries[i].split.left_sum.resize(gradients_per_side);
    std::copy_n(all_gradients.cbegin() + i * gradients_per_entry, gradients_per_side,
                all_entries[i].split.left_sum.begin());
    all_entries[i].split.right_sum.resize(gradients_per_side);
    std::copy_n(all_gradients.cbegin() + i * gradients_per_entry + gradients_per_side,
                gradients_per_side, all_entries[i].split.right_sum.begin());
  });

  return all_entries;
}
public:
void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
@@ -597,6 +676,18 @@ class HistMultiEvaluator {
entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
}
}
if (is_col_split_) {
// With column-wise data split, we gather the best splits from all the workers and update the
// expand entries accordingly.
auto all_entries = Allgather(entries);
for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
entries[nidx_in_set].split.Update(
all_entries[worker * entries.size() + nidx_in_set].split);
}
}
}
}
linalg::Vector<float> InitRoot(linalg::VectorView<GradientPairPrecise const> root_sum) {
@@ -660,7 +751,10 @@ class HistMultiEvaluator {
explicit HistMultiEvaluator(Context const *ctx, MetaInfo const &info, TrainParam const *param,
std::shared_ptr<common::ColumnSampler> sampler)
: param_{param}, column_sampler_{std::move(sampler)}, ctx_{ctx} {
: param_{param},
column_sampler_{std::move(sampler)},
ctx_{ctx},
is_col_split_{info.IsColumnSplit()} {
interaction_constraints_.Configure(*param, info.num_col_);
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
param_->colsample_bynode, param_->colsample_bylevel,

View File

@@ -70,6 +70,22 @@ struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
os << "split:\n" << e.split << std::endl;
return os;
}
/**
 * @brief Populate this entry from another one in preparation for an allgather.
 *
 * The node id and depth are copied directly. The split is delegated to its own
 * CopyAndCollect, which receives both output vectors.
 *
 * @param that The entry whose fields are copied.
 * @param collected_cat_bits Output vector forwarded to the split for collecting cat_bits.
 * @param cat_bits_sizes Output vector forwarded to the split for recording cat_bits sizes.
 */
void CopyAndCollect(CPUExpandEntry const& that, std::vector<uint32_t>* collected_cat_bits,
                    std::vector<std::size_t>* cat_bits_sizes) {
  // Delegate the split first; the scalar fields below are independent of it.
  this->split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes);
  this->depth = that.depth;
  this->nid = that.nid;
}
};
struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
@@ -119,6 +135,24 @@ struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
os << "]\n";
return os;
}
/**
 * @brief Populate this entry from another one in preparation for an allgather.
 *
 * The node id and depth are copied directly. The split is delegated to its own
 * CopyAndCollect, which receives all three output vectors.
 *
 * @param that The entry whose fields are copied.
 * @param collected_cat_bits Output vector forwarded to the split for collecting cat_bits.
 * @param cat_bits_sizes Output vector forwarded to the split for recording cat_bits sizes.
 * @param collected_gradients Output vector forwarded to the split for collecting gradients.
 */
void CopyAndCollect(MultiExpandEntry const& that, std::vector<uint32_t>* collected_cat_bits,
                    std::vector<std::size_t>* cat_bits_sizes,
                    std::vector<GradientPairPrecise>* collected_gradients) {
  // Delegate the split first; the scalar fields below are independent of it.
  this->split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes, collected_gradients);
  this->depth = that.depth;
  this->nid = that.nid;
}
};
} // namespace xgboost::tree
#endif // XGBOOST_TREE_HIST_EXPAND_ENTRY_H_

View File

@@ -419,6 +419,60 @@ struct SplitEntryContainer {
<< "right_sum: " << s.right_sum << std::endl;
return os;
}
/**
 * @brief Copy the fixed-size fields from another split and append its cat_bits to a buffer.
 *
 * This is used for allgather. The cat_bits of `this` are not modified; the source's cat_bits
 * are appended to `collected_cat_bits` and their count is appended to `cat_bits_sizes`.
 * Only available when GradientT is trivially copyable, so the gradient sums are copied by value.
 *
 * @param that The split to copy from.
 * @param collected_cat_bits Buffer that receives the source's cat_bits.
 * @param cat_bits_sizes Buffer that receives the number of cat_bits appended.
 */
void CopyAndCollect(SplitEntryContainer<GradientT> const &that,
                    std::vector<uint32_t> *collected_cat_bits,
                    std::vector<std::size_t> *cat_bits_sizes) {
  static_assert(std::is_trivially_copyable_v<GradientT>);

  // Fixed-size fields, including the gradient sums.
  this->is_cat = that.is_cat;
  this->split_value = that.split_value;
  this->sindex = that.sindex;
  this->loss_chg = that.loss_chg;
  this->right_sum = that.right_sum;
  this->left_sum = that.left_sum;

  // Variable-length part: append the bits, then record how many were appended.
  auto const &src_bits = that.cat_bits;
  collected_cat_bits->insert(collected_cat_bits->end(), src_bits.begin(), src_bits.end());
  cat_bits_sizes->push_back(src_bits.size());
}
/**
 * @brief Copy the fixed-size fields from another split and append its cat_bits and gradient
 *        sums to buffers.
 *
 * This is used for allgather. The cat_bits and gradient sums of `this` are not modified. The
 * source's cat_bits go to `collected_cat_bits` (with their count recorded in `cat_bits_sizes`),
 * and its left_sum followed by its right_sum go to `collected_gradients`.
 * Only available when GradientT is not trivially copyable.
 *
 * @param that The split to copy from.
 * @param collected_cat_bits Buffer that receives the source's cat_bits.
 * @param cat_bits_sizes Buffer that receives the number of cat_bits appended.
 * @param collected_gradients Buffer that receives left_sum, then right_sum.
 */
template <typename G>
void CopyAndCollect(SplitEntryContainer<GradientT> const &that,
                    std::vector<uint32_t> *collected_cat_bits,
                    std::vector<std::size_t> *cat_bits_sizes,
                    std::vector<G> *collected_gradients) {
  static_assert(!std::is_trivially_copyable_v<GradientT>);

  // Fixed-size fields.
  this->is_cat = that.is_cat;
  this->split_value = that.split_value;
  this->sindex = that.sindex;
  this->loss_chg = that.loss_chg;

  // Variable-length cat_bits: append the bits, then record how many were appended.
  auto const &src_bits = that.cat_bits;
  collected_cat_bits->insert(collected_cat_bits->end(), src_bits.begin(), src_bits.end());
  cat_bits_sizes->push_back(src_bits.size());

  // Gradient sums are flattened as left_sum followed by right_sum.
  auto const &lhs = that.left_sum;
  auto const &rhs = that.right_sum;
  collected_gradients->insert(collected_gradients->end(), lhs.cbegin(), lhs.cend());
  collected_gradients->insert(collected_gradients->end(), rhs.cbegin(), rhs.cend());
}
/*!\return feature index to split on */
[[nodiscard]] bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); }
/*!\return whether missing value goes to left branch */

View File

@@ -44,7 +44,7 @@ class GloablApproxBuilder {
protected:
TrainParam const *param_;
std::shared_ptr<common::ColumnSampler> col_sampler_;
HistEvaluator<CPUExpandEntry> evaluator_;
HistEvaluator evaluator_;
HistogramBuilder<CPUExpandEntry> histogram_builder_;
Context const *ctx_;
ObjInfo const *const task_;

View File

@@ -13,6 +13,7 @@
#include <utility> // for move, swap
#include <vector> // for vector
#include "../collective/aggregator.h" // for GlobalSum
#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
#include "../collective/communicator.h" // for Operation
#include "../common/hist_util.h" // for HistogramCuts, HistCollection
@@ -200,8 +201,8 @@ class MultiTargetHistBuilder {
}
}
CHECK(root_sum.CContiguous());
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2);
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
root_sum.Size() * 2);
std::vector<MultiExpandEntry> nodes{best};
std::size_t i = 0;
@@ -335,7 +336,7 @@ class HistBuilder {
common::Monitor *monitor_;
TrainParam const *param_;
std::shared_ptr<common::ColumnSampler> col_sampler_;
std::unique_ptr<HistEvaluator<CPUExpandEntry>> evaluator_;
std::unique_ptr<HistEvaluator> evaluator_;
std::vector<CommonRowPartitioner> partitioner_;
// back pointers to tree and data matrix
@@ -354,7 +355,7 @@ class HistBuilder {
: monitor_{monitor},
param_{param},
col_sampler_{std::move(column_sampler)},
evaluator_{std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx, param, fmat->Info(),
evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(),
col_sampler_)},
p_last_fmat_(fmat),
histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
@@ -395,8 +396,7 @@ class HistBuilder {
}
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed(), fmat->Info().IsColumnSplit());
evaluator_ = std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx_, this->param_, fmat->Info(),
col_sampler_);
evaluator_ = std::make_unique<HistEvaluator>(ctx_, this->param_, fmat->Info(), col_sampler_);
p_last_tree_ = p_tree;
monitor_->Stop(__func__);
}
@@ -455,8 +455,7 @@ class HistBuilder {
for (auto const &grad : gpair_h) {
grad_stat.Add(grad.GetGrad(), grad.GetHess());
}
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat),
2);
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&grad_stat), 2);
}
auto weight = evaluator_->InitRoot(GradStats{grad_stat});

View File

@@ -20,7 +20,7 @@ namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_refresh);
/*! \brief pruner that prunes a tree after growing finishs */
/*! \brief pruner that prunes a tree after growing finishes */
class TreeRefresher : public TreeUpdater {
public:
explicit TreeRefresher(Context const *ctx) : TreeUpdater(ctx) {}