temp merge, disable 1 line, SetValid

This commit is contained in:
Your Name
2023-10-12 16:16:44 -07:00
492 changed files with 15533 additions and 9376 deletions

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2019 XGBoost contributors
/**
* Copyright 2019-2023, XGBoost contributors
*/
#include <thrust/copy.h>
#include <thrust/device_vector.h>
@@ -140,20 +140,20 @@ void FeatureInteractionConstraintDevice::Reset() {
__global__ void ClearBuffersKernel(
LBitField64 result_buffer_output, LBitField64 result_buffer_input) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < result_buffer_output.Size()) {
if (tid < result_buffer_output.Capacity()) {
result_buffer_output.Clear(tid);
}
if (tid < result_buffer_input.Size()) {
if (tid < result_buffer_input.Capacity()) {
result_buffer_input.Clear(tid);
}
}
void FeatureInteractionConstraintDevice::ClearBuffers() {
CHECK_EQ(output_buffer_bits_.Size(), input_buffer_bits_.Size());
CHECK_LE(feature_buffer_.Size(), output_buffer_bits_.Size());
CHECK_EQ(output_buffer_bits_.Capacity(), input_buffer_bits_.Capacity());
CHECK_LE(feature_buffer_.Capacity(), output_buffer_bits_.Capacity());
uint32_t constexpr kBlockThreads = 256;
auto const n_grids = static_cast<uint32_t>(
common::DivRoundUp(input_buffer_bits_.Size(), kBlockThreads));
common::DivRoundUp(input_buffer_bits_.Capacity(), kBlockThreads));
dh::LaunchKernel {n_grids, kBlockThreads} (
ClearBuffersKernel,
output_buffer_bits_, input_buffer_bits_);
@@ -207,11 +207,11 @@ common::Span<bst_feature_t> FeatureInteractionConstraintDevice::Query(
ClearBuffers();
LBitField64 node_constraints = s_node_constraints_[nid];
CHECK_EQ(input_buffer_bits_.Size(), output_buffer_bits_.Size());
CHECK_EQ(input_buffer_bits_.Capacity(), output_buffer_bits_.Capacity());
uint32_t constexpr kBlockThreads = 256;
auto n_grids = static_cast<uint32_t>(
common::DivRoundUp(output_buffer_bits_.Size(), kBlockThreads));
common::DivRoundUp(output_buffer_bits_.Capacity(), kBlockThreads));
dh::LaunchKernel {n_grids, kBlockThreads} (
SetInputBufferKernel,
feature_list, input_buffer_bits_);
@@ -274,13 +274,13 @@ __global__ void InteractionConstraintSplitKernel(LBitField64 feature,
LBitField64 left,
LBitField64 right) {
auto tid = threadIdx.x + blockDim.x * blockIdx.x;
if (tid > node.Size()) {
if (tid > node.Capacity()) {
return;
}
// enable constraints from feature
node |= feature;
// clear the buffer after use
if (tid < feature.Size()) {
if (tid < feature.Capacity()) {
feature.Clear(tid);
}
@@ -323,7 +323,7 @@ void FeatureInteractionConstraintDevice::Split(
s_sets_, s_sets_ptr_);
uint32_t constexpr kBlockThreads = 256;
auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Size(), kBlockThreads));
auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Capacity(), kBlockThreads));
dh::LaunchKernel {n_grids, kBlockThreads} (
InteractionConstraintSplitKernel,

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2018-2019 by Contributors
/**
* Copyright 2018-2023 by Contributors
*/
#ifndef XGBOOST_TREE_CONSTRAINTS_H_
#define XGBOOST_TREE_CONSTRAINTS_H_
@@ -8,10 +8,8 @@
#include <unordered_set>
#include <vector>
#include "xgboost/span.h"
#include "xgboost/base.h"
#include "param.h"
#include "xgboost/base.h"
namespace xgboost {
/*!

View File

@@ -55,27 +55,26 @@ void FitStump(Context const* ctx, MetaInfo const& info,
} // namespace cpu_impl
namespace cuda_impl {
void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
linalg::VectorView<float> out);
void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out);
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
inline void FitStump(Context const*, linalg::TensorView<GradientPair const, 2>,
inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<GradientPair const, 2>,
linalg::VectorView<float>) {
common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
} // namespace cuda_impl
void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out) {
out->SetDevice(ctx->gpu_id);
out->Reshape(n_targets);
auto n_samples = gpair.Size() / n_targets;
gpair.SetDevice(ctx->gpu_id);
auto gpair_t = linalg::MakeTensorView(ctx, &gpair, n_samples, n_targets);
gpair.SetDevice(ctx->Device());
auto gpair_t = gpair.View(ctx->Device());
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
: cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id));
: cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
}
} // namespace tree
} // namespace xgboost

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2022 by XGBoost Contributors
* Copyright 2022-2023 by XGBoost Contributors
*
* \brief Utilities for estimating initial score.
*/
@@ -11,6 +11,7 @@
#include <cstddef> // std::size_t
#include "../collective/aggregator.cuh"
#include "../collective/communicator-inl.cuh"
#include "../common/device_helpers.cuh" // dh::MakeTransformIterator
#include "fit_stump.h"
@@ -23,8 +24,8 @@
namespace xgboost {
namespace tree {
namespace cuda_impl {
void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
linalg::VectorView<float> out) {
void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
auto n_targets = out.Size();
CHECK_EQ(n_targets, gpair.Shape(1));
linalg::Vector<GradientPairPrecise> sum = linalg::Constant(ctx, GradientPairPrecise{}, n_targets);
@@ -41,7 +42,7 @@ void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpai
auto sample = i % gpair.Shape(0);
return GradientPairPrecise{gpair(sample, target)};
});
auto d_sum = sum.View(ctx->gpu_id);
auto d_sum = sum.View(ctx->Device());
CHECK(d_sum.CContiguous());
dh::XGBCachingDeviceAllocator<char> alloc;
@@ -55,8 +56,8 @@ void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpai
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
collective::AllReduce<collective::Operation::kSum>(
ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()), d_sum.Size() * 2);
collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()),
d_sum.Size() * 2);
thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
[=] XGBOOST_DEVICE(std::size_t i) mutable {

View File

@@ -31,7 +31,7 @@ XGBOOST_DEVICE inline double CalcUnregularizedWeight(T sum_grad, T sum_hess) {
/**
* @brief Fit a tree stump as an estimation of base_score.
*/
void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out);
} // namespace tree
} // namespace xgboost

View File

@@ -1,12 +1,12 @@
/*!
* Copyright 2020-2022 by XGBoost Contributors
/**
* Copyright 2020-2023, XGBoost Contributors
*/
#include <algorithm> // std::max
#include <vector>
#include <limits>
#include "../../collective/communicator-inl.cuh"
#include "../../common/categorical.h"
#include "../../common/device_helpers.cuh"
#include "../../data/ellpack_page.cuh"
#include "evaluate_splits.cuh"
#include "expand_entry.cuh"
@@ -24,13 +24,11 @@
#define WARP_SIZE 32
#endif
namespace xgboost {
#if defined(XGBOOST_USE_HIP)
namespace cub = hipcub;
#endif
namespace tree {
namespace xgboost::tree {
// With constraints
XGBOOST_DEVICE float LossChangeMissing(const GradientPairInt64 &scan,
const GradientPairInt64 &missing,
@@ -352,11 +350,11 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
common::Span<common::CatBitField::value_type> out,
DeviceSplitCandidate *p_out_split) {
auto &out_split = *p_out_split;
out_split.split_cats = common::CatBitField{out};
auto out_cats = common::CatBitField{out};
// Simple case for one hot split
if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) {
out_split.split_cats.Set(common::AsCat(out_split.thresh));
out_cats.Set(common::AsCat(out_split.thresh));
return;
}
@@ -376,7 +374,7 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
assert(partition > 0 && "Invalid partition.");
thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) {
auto cat = shared_inputs.feature_values[c - node_offset];
out_split.SetCat(cat);
out_cats.Set(common::AsCat(cat));
});
}
@@ -453,6 +451,24 @@ void GPUHistEvaluator::EvaluateSplits(
this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
evaluator, out_splits);
if (is_column_split_) {
// With column-wise data split, we gather the split candidates from all the workers and find the
// global best candidates.
auto const world_size = collective::GetWorldSize();
dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
auto all_candidates = dh::ToSpan(all_candidate_storage);
collective::AllGather(device_, out_splits.data(), all_candidates.data(),
out_splits.size() * sizeof(DeviceSplitCandidate));
// Reduce to get the best candidate from all workers.
dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
out_splits[i] = all_candidates[i];
for (auto rank = 1; rank < world_size; rank++) {
out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
}
});
}
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
auto d_entries = out_entries;
auto device_cats_accessor = this->DeviceCatStorage(nidx);
@@ -471,8 +487,7 @@ void GPUHistEvaluator::EvaluateSplits(
if (split.is_cat) {
SetCategoricalSplit(shared_inputs, d_sorted_idx, fidx, i,
device_cats_accessor.GetNodeCatStorage(input.nidx),
&out_splits[i]);
device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]);
}
float base_weight =
@@ -510,6 +525,4 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
#endif
return root_entry;
}
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -37,8 +37,8 @@ struct EvaluateSplitSharedInputs {
common::Span<const float> feature_values;
common::Span<const float> min_fvalue;
bool is_dense;
XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
__device__ auto FeatureBins(bst_feature_t fidx) const {
[[nodiscard]] XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
[[nodiscard]] __device__ std::uint32_t FeatureBins(bst_feature_t fidx) const {
return feature_segments[fidx + 1] - feature_segments[fidx];
}
};
@@ -83,6 +83,9 @@ class GPUHistEvaluator {
// Number of elements of categorical storage type
// needed to hold categoricals for a single node
std::size_t node_categorical_storage_size_ = 0;
// Is the data split column-wise?
bool is_column_split_ = false;
int32_t device_;
// Copy the categories from device to host asynchronously.
void CopyToHost( const std::vector<bst_node_t>& nidx);
@@ -102,7 +105,7 @@ class GPUHistEvaluator {
}
/**
* \brief Get device category storage of nidx for internal calculation.
* @brief Get device category storage of nidx for internal calculation.
*/
auto DeviceCatStorage(const std::vector<bst_node_t> &nidx) {
if (!has_categoricals_) return CatAccessor{};
@@ -117,8 +120,8 @@ class GPUHistEvaluator {
/**
* \brief Get sorted index storage based on the left node of inputs.
*/
auto SortedIdx(int num_nodes, bst_feature_t total_bins) {
if(!need_sort_histogram_) return common::Span<bst_feature_t>();
auto SortedIdx(int num_nodes, bst_bin_t total_bins) {
if (!need_sort_histogram_) return common::Span<bst_feature_t>{};
cat_sorted_idx_.resize(num_nodes * total_bins);
return dh::ToSpan(cat_sorted_idx_);
}
@@ -136,18 +139,29 @@ class GPUHistEvaluator {
* \brief Reset the evaluator, should be called before any use.
*/
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param, int32_t device);
bst_feature_t n_features, TrainParam const &param, bool is_column_split,
int32_t device);
/**
* \brief Get host category storage for nidx. Different from the internal version, this
* returns strictly 1 node.
*/
common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
[[nodiscard]] common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
copy_stream_.View().Sync();
auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(
nidx * node_categorical_storage_size_, node_categorical_storage_size_);
return cats_out;
}
/**
 * \brief Get device category storage of a single node as a bit field.
 *
 * Synchronizes copy_stream_ before touching the storage. When has_categoricals_ is
 * false there is nothing to look up and an empty bit field is returned.
 *
 * \param nidx Index of the node whose category storage is requested.
 */
[[nodiscard]] auto GetDeviceNodeCats(bst_node_t nidx) {
  copy_stream_.View().Sync();
  // Guard clause: no categorical storage to expose.
  if (!has_categoricals_) {
    return common::KCatBitField{};
  }
  // View over split_cats_ sliced into node_categorical_storage_size_ elements per node.
  CatAccessor node_storage = {dh::ToSpan(split_cats_), node_categorical_storage_size_};
  return common::KCatBitField{node_storage.GetNodeCatStorage(nidx)};
}
/**
* \brief Add a split to the internal tree evaluator.
*/

View File

@@ -14,10 +14,9 @@
namespace xgboost {
namespace tree {
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param,
int32_t device) {
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param,
bool is_column_split, int32_t device) {
param_ = param;
tree_evaluator_ = TreeEvaluator{param, n_features, device};
has_categoricals_ = cuts.HasCategorical();
@@ -93,6 +92,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
});
#endif
}
is_column_split_ = is_column_split;
device_ = device;
}
common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(

View File

@@ -8,10 +8,10 @@
#include <xgboost/logging.h>
#include <algorithm>
#include <cstddef> // for size_t
#include <limits>
#include <utility>
#include "../../common/compressed_iterator.h"
#include "../../common/cuda_context.cuh" // for CUDAContext
#include "../../common/random.h"
#include "../param.h"
@@ -146,27 +146,30 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
CombineGradientPair combine_;
};
NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
NoSampling::NoSampling(BatchParam batch_param) : batch_param_(std::move(batch_param)) {}
GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
GradientBasedSample NoSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) {
return {dmat->Info().num_row_, page_, gpair};
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
return {dmat->Info().num_row_, page, gpair};
}
ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
size_t n_rows, BatchParam batch_param)
: batch_param_{std::move(batch_param)},
page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
n_rows)) {}
ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
: batch_param_{std::move(batch_param)} {}
GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
common::Span<GradientPair> gpair,
DMatrix* dmat) {
if (!page_concatenated_) {
// Concatenate all the external memory ELLPACK pages into a single in-memory page.
page_.reset(nullptr);
size_t offset = 0;
for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
auto page = batch.Impl();
if (!page_) {
page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
page->row_stride, dmat->Info().num_row_);
}
size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
offset += num_elements;
}
@@ -175,8 +178,8 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
return {dmat->Info().num_row_, page_.get(), gpair};
}
UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
: page_(page), subsample_(subsample) {}
UniformSampling::UniformSampling(BatchParam batch_param, float subsample)
: batch_param_{std::move(batch_param)}, subsample_(subsample) {}
GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) {
@@ -185,7 +188,8 @@ GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<Gra
thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<std::size_t>(0),
BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair());
return {dmat->Info().num_row_, page_, gpair};
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
return {dmat->Info().num_row_, page, gpair};
}
ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
@@ -198,27 +202,27 @@ ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
common::Span<GradientPair> gpair,
DMatrix* dmat) {
auto cuctx = ctx->CUDACtx();
// Set gradient pair to 0 with p = 1 - subsample
thrust::replace_if(dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<size_t>(0),
BernoulliTrial(common::GlobalRandom()(), subsample_),
GradientPair());
thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<std::size_t>(0),
BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair{});
// Count the sampled rows.
size_t sample_rows = thrust::count_if(dh::tbegin(gpair), dh::tend(gpair), IsNonZero());
size_t sample_rows =
thrust::count_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), IsNonZero{});
// Compact gradient pairs.
gpair_.resize(sample_rows);
thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero{});
// Index the sample rows.
thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
IsNonZero());
thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
sample_row_index_.begin());
thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
sample_row_index_.begin(),
sample_row_index_.begin(),
ClearEmptyRows());
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
sample_row_index_.begin(), ClearEmptyRows());
auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
auto first_page = (*batch_iterator.begin()).Impl();
@@ -228,7 +232,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
first_page->row_stride, sample_rows));
// Compact the ELLPACK pages into the single sample page.
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
for (auto& batch : batch_iterator) {
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
}
@@ -236,12 +240,10 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
}
GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam&,
GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batch_param,
float subsample)
: page_(page),
subsample_(subsample),
: subsample_(subsample),
batch_param_{std::move(batch_param)},
threshold_(n_rows + 1, 0.0f),
grad_sum_(n_rows, 0.0f) {}
@@ -252,18 +254,19 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
// Perform Poisson sampling in place.
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
RandomWeight(common::GlobalRandom()())));
return {n_rows, page_, gpair};
return {n_rows, page, gpair};
}
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
size_t n_rows,
BatchParam batch_param,
float subsample)
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows,
BatchParam batch_param,
float subsample)
: batch_param_(std::move(batch_param)),
subsample_(subsample),
threshold_(n_rows + 1, 0.0f),
@@ -273,16 +276,15 @@ ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* ctx,
common::Span<GradientPair> gpair,
DMatrix* dmat) {
size_t n_rows = dmat->Info().num_row_;
auto cuctx = ctx->CUDACtx();
bst_row_t n_rows = dmat->Info().num_row_;
size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
// Perform Poisson sampling in place.
thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<size_t>(0),
dh::tbegin(gpair),
PoissonSampling(dh::ToSpan(threshold_),
threshold_index,
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
RandomWeight(common::GlobalRandom()())));
// Count the sampled rows.
@@ -290,16 +292,15 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
// Compact gradient pairs.
gpair_.resize(sample_rows);
thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
// Index the sample rows.
thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
sample_row_index_.begin());
thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
sample_row_index_.begin(),
sample_row_index_.begin(),
ClearEmptyRows());
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
IsNonZero());
thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
sample_row_index_.begin());
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
sample_row_index_.begin(), ClearEmptyRows());
auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
auto first_page = (*batch_iterator.begin()).Impl();
@@ -317,13 +318,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
}
GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
size_t n_rows, const BatchParam& batch_param,
float subsample, int sampling_method) {
GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows,
const BatchParam& batch_param, float subsample,
int sampling_method, bool is_external_memory) {
// The ctx is kept here for future development of stream-based operations.
monitor_.Init("gradient_based_sampler");
bool is_sampling = subsample < 1.0;
bool is_external_memory = page->n_rows != n_rows;
if (is_sampling) {
switch (sampling_method) {
@@ -331,24 +332,24 @@ GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl c
if (is_external_memory) {
strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new UniformSampling(page, subsample));
strategy_.reset(new UniformSampling(batch_param, subsample));
}
break;
case TrainParam::kGradientBased:
if (is_external_memory) {
strategy_.reset(
new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new GradientBasedSampling(page, n_rows, batch_param, subsample));
strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
}
break;
default:LOG(FATAL) << "unknown sampling method";
default:
LOG(FATAL) << "unknown sampling method";
}
} else {
if (is_external_memory) {
strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
strategy_.reset(new ExternalMemoryNoSampling(batch_param));
} else {
strategy_.reset(new NoSampling(page));
strategy_.reset(new NoSampling(batch_param));
}
}
}
@@ -362,11 +363,11 @@ GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
return sample;
}
size_t GradientBasedSampler::CalculateThresholdIndex(
common::Span<GradientPair> gpair, common::Span<float> threshold,
common::Span<float> grad_sum, size_t sample_rows) {
thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold),
std::numeric_limits<float>::max());
size_t GradientBasedSampler::CalculateThresholdIndex(common::Span<GradientPair> gpair,
common::Span<float> threshold,
common::Span<float> grad_sum,
size_t sample_rows) {
thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold), std::numeric_limits<float>::max());
thrust::transform(dh::tbegin(gpair), dh::tend(gpair), dh::tbegin(threshold),
CombineGradientPair());
thrust::sort(dh::tbegin(threshold), dh::tend(threshold) - 1);
@@ -379,6 +380,5 @@ size_t GradientBasedSampler::CalculateThresholdIndex(
thrust::min_element(dh::tbegin(grad_sum), dh::tend(grad_sum));
return thrust::distance(dh::tbegin(grad_sum), min) + 1;
}
}; // namespace tree
}; // namespace xgboost

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2019 by XGBoost Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
*/
#pragma once
#include <xgboost/base.h>
@@ -32,37 +32,36 @@ class SamplingStrategy {
/*! \brief No sampling in in-memory mode. */
class NoSampling : public SamplingStrategy {
public:
explicit NoSampling(EllpackPageImpl const* page);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
EllpackPageImpl const* page_;
};
/*! \brief No sampling in external memory mode. */
class ExternalMemoryNoSampling : public SamplingStrategy {
public:
ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
BatchParam batch_param);
explicit NoSampling(BatchParam batch_param);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
BatchParam batch_param_;
std::unique_ptr<EllpackPageImpl> page_;
};
/*!
 * \brief No sampling in external memory mode.
 *
 * Sample() walks every EllpackPage batch of the DMatrix and copies it into one
 * in-memory page owned by this object, then returns that page along with the
 * gradient pairs it was given.
 */
class ExternalMemoryNoSampling : public SamplingStrategy {
public:
/*! \brief Store the batch parameter used later to fetch the ELLPACK batches. */
explicit ExternalMemoryNoSampling(BatchParam batch_param);
/*! \brief Return all rows of dmat as a single concatenated page; gpair is passed through. */
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
// Passed to DMatrix::GetBatches<EllpackPage>() when iterating over the pages.
BatchParam batch_param_;
// Single page that Sample() allocates and fills by copying each batch into it.
std::unique_ptr<EllpackPageImpl> page_{nullptr};
// Sample() only performs the concatenation while this flag is false.
bool page_concatenated_{false};
};
/*! \brief Uniform sampling in in-memory mode. */
class UniformSampling : public SamplingStrategy {
public:
UniformSampling(EllpackPageImpl const* page, float subsample);
UniformSampling(BatchParam batch_param, float subsample);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
EllpackPageImpl const* page_;
BatchParam batch_param_;
float subsample_;
};
@@ -84,13 +83,12 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
/*! \brief Gradient-based sampling in in-memory mode. */
class GradientBasedSampling : public SamplingStrategy {
public:
GradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param,
float subsample);
GradientBasedSampling(std::size_t n_rows, BatchParam batch_param, float subsample);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
EllpackPageImpl const* page_;
BatchParam batch_param_;
float subsample_;
dh::caching_device_vector<float> threshold_;
dh::caching_device_vector<float> grad_sum_;
@@ -106,11 +104,11 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
private:
BatchParam batch_param_;
float subsample_;
dh::caching_device_vector<float> threshold_;
dh::caching_device_vector<float> grad_sum_;
dh::device_vector<float> threshold_;
dh::device_vector<float> grad_sum_;
std::unique_ptr<EllpackPageImpl> page_;
dh::device_vector<GradientPair> gpair_;
dh::caching_device_vector<size_t> sample_row_index_;
dh::device_vector<size_t> sample_row_index_;
};
/*! \brief Draw a sample of rows from a DMatrix.
@@ -124,8 +122,8 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
*/
class GradientBasedSampler {
public:
GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
const BatchParam& batch_param, float subsample, int sampling_method);
GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param,
float subsample, int sampling_method, bool is_external_memory);
/*! \brief Sample from a DMatrix based on the given gradient pairs. */
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);

View File

@@ -8,6 +8,7 @@
#include <cstdint> // uint32_t
#include <limits>
#include "../../collective/aggregator.h"
#include "../../common/deterministic.cuh"
#include "../../common/device_helpers.cuh"
#include "../../data/ellpack_page.cuh"
@@ -52,7 +53,7 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
*
* to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
*/
GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info) {
using GradientSumT = GradientPairPrecise;
using T = typename GradientSumT::ValueT;
dh::XGBCachingDeviceAllocator<char> alloc;
@@ -70,11 +71,11 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
// Treat pair as array of 4 primitive types to allreduce
using ReduceT = typename decltype(p.first)::ValueT;
static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<ReduceT*>(&p), 4);
collective::GlobalSum(info, reinterpret_cast<ReduceT*>(&p), 4);
GradientPair positive_sum{p.first}, negative_sum{p.second};
std::size_t total_rows = gpair.size();
collective::Allreduce<collective::Operation::kSum>(&total_rows, 1);
collective::GlobalSum(info, &total_rows, 1);
auto histogram_rounding =
GradientSumT{common::CreateRoundingFactor<T>(

View File

@@ -39,7 +39,7 @@ private:
GradientPairPrecise to_floating_point_;
public:
explicit GradientQuantiser(common::Span<GradientPair const> gpair);
GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info);
XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
gpair.GetHess() * to_fixed_point_.GetHess());

View File

@@ -24,21 +24,13 @@ RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaStreamCreate(&stream_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipStreamCreate(&stream_));
#endif
}
RowPartitioner::~RowPartitioner() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
dh::safe_cuda(cudaStreamDestroy(stream_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
dh::safe_cuda(hipStreamDestroy(stream_));
#endif
}

View File

@@ -116,13 +116,7 @@ template <typename RowIndexT, typename OpT, typename OpDataT>
void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
common::Span<RowIndexT> ridx, common::Span<RowIndexT> ridx_tmp,
common::Span<bst_uint> d_counts, std::size_t total_rows, OpT op,
dh::device_vector<int8_t>* tmp,
#if defined(XGBOOST_USE_HIP)
hipStream_t stream
#elif defined(XGBOOST_USE_CUDA)
cudaStream_t stream
#endif
) {
dh::device_vector<int8_t>* tmp) {
dh::LDGIterator<PerNodeData<OpDataT>> batch_info_itr(d_batch_info.data());
WriteResultsFunctor<OpDataT> write_results{batch_info_itr, ridx.data(), ridx_tmp.data(),
d_counts.data()};
@@ -135,29 +129,28 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
int batch_idx;
std::size_t item_idx;
AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx);
auto op_res = op(ridx[item_idx], batch_info_itr[batch_idx].data);
auto op_res = op(ridx[item_idx], batch_idx, batch_info_itr[batch_idx].data);
return IndexFlagTuple{static_cast<bst_uint>(item_idx), op_res, batch_idx, op_res};
});
size_t temp_bytes = 0;
if (tmp->empty()) {
#if defined(XGBOOST_USE_CUDA)
cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
IndexFlagOp(), total_rows, stream);
IndexFlagOp(), total_rows);
#elif defined(XGBOOST_USE_HIP)
rocprim::inclusive_scan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
total_rows, IndexFlagOp(), stream);
total_rows,IndexFlagOp());
#endif
tmp->resize(temp_bytes);
}
temp_bytes = tmp->size();
#if defined(XGBOOST_USE_CUDA)
cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator,
discard_write_iterator, IndexFlagOp(), total_rows, stream);
discard_write_iterator, IndexFlagOp(), total_rows);
#elif defined(XGBOOST_USE_HIP)
rocprim::inclusive_scan(tmp->data().get(), temp_bytes, input_iterator, discard_write_iterator,
total_rows, IndexFlagOp(), stream);
rocprim::inclusive_scan(tmp->data().get(), temp_bytes, input_iterator,
discard_write_iterator, total_rows, IndexFlagOp());
#endif
constexpr int kBlockSize = 256;
@@ -167,7 +160,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread);
SortPositionCopyKernel<kBlockSize, RowIndexT, OpDataT>
<<<grid_size, kBlockSize, 0, stream>>>(batch_info_itr, ridx, ridx_tmp, total_rows);
<<<grid_size, kBlockSize, 0>>>(batch_info_itr, ridx, ridx_tmp, total_rows);
}
struct NodePositionInfo {
@@ -240,12 +233,6 @@ class RowPartitioner {
dh::PinnedMemory pinned_;
dh::PinnedMemory pinned2_;
#if defined(XGBOOST_USE_HIP)
hipStream_t stream_;
#else
cudaStream_t stream_;
#endif
public:
RowPartitioner(int device_idx, size_t num_rows);
~RowPartitioner();
@@ -303,11 +290,11 @@ class RowPartitioner {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
hipMemcpyDefault, stream_));
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
cudaMemcpyDefault, stream_));
cudaMemcpyDefault));
#endif
// Temporary arrays
@@ -317,23 +304,17 @@ class RowPartitioner {
// Partition the rows according to the operator
SortPositionBatch<RowIndexT, UpdatePositionOpT, OpDataT>(
dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts),
total_rows, op, &tmp_, stream_);
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
hipMemcpyDefault, stream_));
#else
total_rows, op, &tmp_);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
cudaMemcpyDefault, stream_));
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
hipMemcpyDefault));
#endif
// TODO(Rory): this synchronisation hurts performance a lot
// Future optimisation should find a way to skip this
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipStreamSynchronize(stream_));
#else
dh::safe_cuda(cudaStreamSynchronize(stream_));
#endif
dh::DefaultStream().Sync();
// Update segments
for (size_t i = 0; i < nidx.size(); i++) {
@@ -370,18 +351,18 @@ class RowPartitioner {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
sizeof(NodePositionInfo) * ridx_segments_.size(),
hipMemcpyDefault, stream_));
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
sizeof(NodePositionInfo) * ridx_segments_.size(),
cudaMemcpyDefault, stream_));
cudaMemcpyDefault));
#endif
constexpr int kBlockSize = 512;
const int kItemsThread = 8;
const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread);
common::Span<const RowIndexT> d_ridx(ridx_.data().get(), ridx_.size());
FinalisePositionKernel<kBlockSize><<<grid_size, kBlockSize, 0, stream_>>>(
FinalisePositionKernel<kBlockSize><<<grid_size, kBlockSize, 0>>>(
dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op);
}
};

View File

@@ -4,13 +4,13 @@
#ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#include <algorithm> // for copy
#include <cstddef> // for size_t
#include <limits> // for numeric_limits
#include <memory> // for shared_ptr
#include <numeric> // for accumulate
#include <utility> // for move
#include <vector> // for vector
#include <algorithm> // for copy
#include <cstddef> // for size_t
#include <limits> // for numeric_limits
#include <memory> // for shared_ptr
#include <numeric> // for accumulate
#include <utility> // for move
#include <vector> // for vector
#include "../../common/categorical.h" // for CatBitField
#include "../../common/hist_util.h" // for GHistRow, HistogramCuts
@@ -20,6 +20,7 @@
#include "../param.h" // for TrainParam
#include "../split_evaluator.h" // for TreeEvaluator
#include "expand_entry.h" // for MultiExpandEntry
#include "hist_cache.h" // for BoundedHistCollection
#include "xgboost/base.h" // for bst_node_t, bst_target_t, bst_feature_t
#include "xgboost/context.h" // for COntext
#include "xgboost/linalg.h" // for Constants, Vector
@@ -65,7 +66,7 @@ class HistEvaluator {
* pseudo-category for missing value but here we just do a complete scan to avoid
* making specialized histogram bin.
*/
void EnumerateOneHot(common::HistogramCuts const &cut, const common::GHistRow &hist,
void EnumerateOneHot(common::HistogramCuts const &cut, common::ConstGHistRow hist,
bst_feature_t fidx, bst_node_t nidx,
TreeEvaluator::SplitEvaluator<TrainParam> const &evaluator,
SplitEntry *p_best) const {
@@ -143,7 +144,7 @@ class HistEvaluator {
*/
template <int d_step>
void EnumeratePart(common::HistogramCuts const &cut, common::Span<size_t const> sorted_idx,
common::GHistRow const &hist, bst_feature_t fidx, bst_node_t nidx,
common::ConstGHistRow hist, bst_feature_t fidx, bst_node_t nidx,
TreeEvaluator::SplitEvaluator<TrainParam> const &evaluator,
SplitEntry *p_best) {
static_assert(d_step == +1 || d_step == -1, "Invalid step.");
@@ -214,7 +215,7 @@ class HistEvaluator {
// Returns the sum of gradients corresponding to the data points that contains
// a non-missing value for the particular feature fid.
template <int d_step>
GradStats EnumerateSplit(common::HistogramCuts const &cut, const common::GHistRow &hist,
GradStats EnumerateSplit(common::HistogramCuts const &cut, common::ConstGHistRow hist,
bst_feature_t fidx, bst_node_t nidx,
TreeEvaluator::SplitEvaluator<TrainParam> const &evaluator,
SplitEntry *p_best) const {
@@ -317,7 +318,7 @@ class HistEvaluator {
}
public:
void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
void EvaluateSplits(const BoundedHistCollection &hist, common::HistogramCuts const &cut,
common::Span<FeatureType const> feature_types, const RegTree &tree,
std::vector<CPUExpandEntry> *p_entries) {
auto n_threads = ctx_->Threads();
@@ -454,8 +455,8 @@ class HistEvaluator {
right_child);
}
auto Evaluator() const { return tree_evaluator_.GetEvaluator(); }
auto const& Stats() const { return snode_; }
[[nodiscard]] auto Evaluator() const { return tree_evaluator_.GetEvaluator(); }
[[nodiscard]] auto const &Stats() const { return snode_; }
float InitRoot(GradStats const &root_sum) {
snode_.resize(1);
@@ -510,7 +511,7 @@ class HistMultiEvaluator {
template <bst_bin_t d_step>
bool EnumerateSplit(common::HistogramCuts const &cut, bst_feature_t fidx,
common::Span<common::GHistRow const> hist,
common::Span<common::ConstGHistRow> hist,
linalg::VectorView<GradientPairPrecise const> parent_sum, double parent_gain,
SplitEntryContainer<std::vector<GradientPairPrecise>> *p_best) const {
auto const &cut_ptr = cut.Ptrs();
@@ -623,7 +624,7 @@ class HistMultiEvaluator {
}
public:
void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
void EvaluateSplits(RegTree const &tree, common::Span<const BoundedHistCollection *> hist,
common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
auto &entries = *p_entries;
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(entries.size());
@@ -651,9 +652,9 @@ class HistMultiEvaluator {
auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
auto best = &entry->split;
auto parent_sum = stats_.Slice(entry->nid, linalg::All());
std::vector<common::GHistRow> node_hist;
std::vector<common::ConstGHistRow> node_hist;
for (auto t_hist : hist) {
node_hist.push_back((*t_hist)[entry->nid]);
node_hist.emplace_back((*t_hist)[entry->nid]);
}
auto features_set = features[nidx_in_set]->ConstHostSpan();
@@ -773,7 +774,7 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree,
std::vector<Partitioner> const &partitioner,
linalg::VectorView<float> out_preds) {
auto const &tree = *p_last_tree;
CHECK_EQ(out_preds.DeviceIdx(), Context::kCpuId);
CHECK(out_preds.Device().IsCPU());
size_t n_nodes = p_last_tree->GetNodes().size();
for (auto &part : partitioner) {
CHECK_EQ(part.Size(), n_nodes);
@@ -808,7 +809,7 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree,
auto n_nodes = mttree->Size();
auto n_targets = tree.NumTargets();
CHECK_EQ(out_preds.Shape(1), n_targets);
CHECK_EQ(out_preds.DeviceIdx(), Context::kCpuId);
CHECK(out_preds.Device().IsCPU());
for (auto &part : partitioner) {
CHECK_EQ(part.Size(), n_nodes);

View File

@@ -18,8 +18,8 @@ namespace xgboost::tree {
*/
template <typename Impl>
struct ExpandEntryImpl {
bst_node_t nid;
bst_node_t depth;
bst_node_t nid{0};
bst_node_t depth{0};
[[nodiscard]] float GetLossChange() const {
return static_cast<Impl const*>(this)->split.loss_chg;

113
src/tree/hist/hist_cache.h Normal file
View File

@@ -0,0 +1,113 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#ifndef XGBOOST_TREE_HIST_HIST_CACHE_H_
#define XGBOOST_TREE_HIST_HIST_CACHE_H_
#include <cstddef> // for size_t
#include <map> // for map
#include <memory> // for unique_ptr
#include <vector> // for vector
#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow
#include "../../common/ref_resource_view.h" // for ReallocVector
#include "xgboost/base.h" // for bst_node_t, bst_bin_t
#include "xgboost/logging.h" // for CHECK_GT
#include "xgboost/span.h" // for Span
namespace xgboost::tree {
/**
* @brief A persistent cache for CPU histogram.
*
* The size of the cache is first bounded by the `Driver` class then by this cache
* implementation. The former limits the number of nodes that can be built for each node
* batch, while this cache limits the number of all nodes up to the size of
* max(|node_batch|, n_cached_node).
*
* The caller is responsible for clearing up the cache as it needs to rearrange the
* nodes before making overflowed allocations. The struct only reports whether the size
* limit has been reached.
*/
class BoundedHistCollection {
// maps node index to offset in `data_`.
std::map<bst_node_t, std::size_t> node_map_;
// currently allocated bins, used for tracking consistentcy.
std::size_t current_size_{0};
// stores the histograms in a contiguous buffer
using Vec = common::ReallocVector<GradientPairPrecise>;
std::unique_ptr<Vec> data_{new Vec{}}; // nvcc 12.1 trips over std::make_unique
// number of histogram bins across all features
bst_bin_t n_total_bins_{0};
// limits the number of nodes that can be in the cache for each tree
std::size_t n_cached_nodes_{0};
// whether the tree has grown beyond the cache limit
bool has_exceeded_{false};
public:
BoundedHistCollection() = default;
common::GHistRow operator[](std::size_t idx) {
auto offset = node_map_.at(idx);
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
}
common::ConstGHistRow operator[](std::size_t idx) const {
auto offset = node_map_.at(idx);
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
}
void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
n_total_bins_ = n_total_bins;
n_cached_nodes_ = n_cached_nodes;
this->Clear(false);
}
/**
* @brief Clear the cache, mark whether the cache is exceeded the limit.
*/
void Clear(bool exceeded) {
node_map_.clear();
current_size_ = 0;
has_exceeded_ = exceeded;
}
[[nodiscard]] bool CanHost(common::Span<bst_node_t const> nodes_to_build,
common::Span<bst_node_t const> nodes_to_sub) const {
auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
return n_new_nodes + node_map_.size() <= n_cached_nodes_;
}
/**
* @brief Allocate histogram buffers for all nodes.
*
* The resulting histogram buffer is contiguous for all nodes in the order of
* allocation.
*/
void AllocateHistograms(common::Span<bst_node_t const> nodes_to_build,
common::Span<bst_node_t const> nodes_to_sub) {
auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
auto alloc_size = n_new_nodes * n_total_bins_;
auto new_size = alloc_size + current_size_;
if (new_size > data_->size()) {
data_->Resize(new_size);
}
for (auto nidx : nodes_to_build) {
node_map_[nidx] = current_size_;
current_size_ += n_total_bins_;
}
for (auto nidx : nodes_to_sub) {
node_map_[nidx] = current_size_;
current_size_ += n_total_bins_;
}
CHECK_EQ(current_size_, new_size);
}
void AllocateHistograms(std::vector<bst_node_t> const& nodes) {
this->AllocateHistograms(common::Span<bst_node_t const>{nodes},
common::Span<bst_node_t const>{});
}
[[nodiscard]] bool HasExceeded() const { return has_exceeded_; }
[[nodiscard]] bool HistogramExists(bst_node_t nidx) const {
return node_map_.find(nidx) != node_map_.cend();
}
[[nodiscard]] std::size_t Size() const { return current_size_; }
};
} // namespace xgboost::tree
#endif // XGBOOST_TREE_HIST_HIST_CACHE_H_

View File

@@ -0,0 +1,63 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
#include "histogram.h"
#include <cstddef> // for size_t
#include <numeric> // for accumulate
#include <utility> // for swap
#include <vector> // for vector
#include "../../common/transform_iterator.h" // for MakeIndexTransformIter
#include "expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
#include "xgboost/logging.h" // for CHECK_NE
#include "xgboost/span.h" // for Span
#include "xgboost/tree_model.h" // for RegTree
namespace xgboost::tree {
/**
 * @brief For every candidate, pick which child is built explicitly and which one is
 *        handled by the subtraction trick.
 *
 * The hessians in `split.left_sum` and `split.right_sum` are summed over all their
 * entries; the child with the strictly smaller total becomes the build node, and the
 * left child is chosen on a tie.
 *
 * @param p_tree           Tree used to look up the children of each candidate node.
 * @param valid_candidates Candidates being expanded.
 * @param nodes_to_build   Output, slot i receives the build node of candidate i. Must have
 *                         exactly one slot per candidate.
 * @param nodes_to_sub     Output, slot i receives the other child of candidate i. Must have
 *                         at least one slot per candidate.
 */
void AssignNodes(RegTree const *p_tree, std::vector<MultiExpandEntry> const &valid_candidates,
                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
  CHECK_EQ(nodes_to_build.size(), valid_candidates.size());
  // Each candidate also writes one slot of `nodes_to_sub`; reject a short buffer up front
  // instead of failing in the middle of the loop.
  CHECK_GE(nodes_to_sub.size(), valid_candidates.size());

  std::size_t n_idx = 0;
  for (auto const &c : valid_candidates) {
    auto left_nidx = p_tree->LeftChild(c.nid);
    auto right_nidx = p_tree->RightChild(c.nid);

    auto build_nidx = left_nidx;
    auto subtract_nidx = right_nidx;
    // Accumulate in double (the `.0` seed) over every entry of each child's sum.
    auto lit =
        common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
    auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
    auto rit =
        common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
    auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
    auto fewer_right = right_sum < left_sum;
    if (fewer_right) {
      std::swap(build_nidx, subtract_nidx);
    }
    nodes_to_build[n_idx] = build_nidx;
    nodes_to_sub[n_idx] = subtract_nidx;
    ++n_idx;
  }
}
/**
 * @brief For every candidate, pick which child is built explicitly and which one is
 *        handled by the subtraction trick.
 *
 * The child whose hessian sum is strictly smaller becomes the build node; the left child
 * is chosen on a tie.
 *
 * @param p_tree         Tree used to look up the children of each candidate node.
 * @param candidates     Candidates being expanded.
 * @param nodes_to_build Output, slot i receives the build node of candidate i. Must have
 *                       at least one slot per candidate.
 * @param nodes_to_sub   Output, slot i receives the other child of candidate i. Must have
 *                       at least one slot per candidate.
 */
void AssignNodes(RegTree const *p_tree, std::vector<CPUExpandEntry> const &candidates,
                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
  // One slot of each output is written per candidate; validate both buffers before the
  // loop so an undersized span is reported with a clear message.
  CHECK_GE(nodes_to_build.size(), candidates.size());
  CHECK_GE(nodes_to_sub.size(), candidates.size());

  std::size_t n_idx = 0;
  for (auto const &c : candidates) {
    auto left_nidx = (*p_tree)[c.nid].LeftChild();
    auto right_nidx = (*p_tree)[c.nid].RightChild();
    auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();

    auto build_nidx = left_nidx;
    auto subtract_nidx = right_nidx;
    if (fewer_right) {
      std::swap(build_nidx, subtract_nidx);
    }
    nodes_to_build[n_idx] = build_nidx;
    nodes_to_sub[n_idx] = subtract_nidx;
    ++n_idx;
  }
}
} // namespace xgboost::tree

View File

@@ -4,316 +4,229 @@
#ifndef XGBOOST_TREE_HIST_HISTOGRAM_H_
#define XGBOOST_TREE_HIST_HISTOGRAM_H_
#include <algorithm>
#include <limits>
#include <vector>
#include <algorithm> // for max
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <functional> // for function
#include <utility> // for move
#include <vector> // for vector
#include "../../collective/communicator-inl.h"
#include "../../common/hist_util.h"
#include "../../data/gradient_index.h"
#include "expand_entry.h"
#include "xgboost/tree_model.h" // for RegTree
#include "../../collective/communicator-inl.h" // for Allreduce
#include "../../collective/communicator.h" // for Operation
#include "../../common/hist_util.h" // for GHistRow, ParallelGHi...
#include "../../common/row_set.h" // for RowSetCollection
#include "../../common/threading_utils.h" // for ParallelFor2d, Range1d, BlockedSpace2d
#include "../../data/gradient_index.h" // for GHistIndexMatrix
#include "expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
#include "hist_cache.h" // for BoundedHistCollection
#include "param.h" // for HistMakerTrainParam
#include "xgboost/base.h" // for bst_node_t, bst_target_t, bst_bin_t
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for BatchIterator, BatchSet
#include "xgboost/linalg.h" // for MatrixView, All, Vect...
#include "xgboost/logging.h" // for CHECK_GE
#include "xgboost/span.h" // for Span
#include "xgboost/tree_model.h" // for RegTree
namespace xgboost::tree {
/**
* @brief Decide which node to use as the build node for multi-target trees.
*/
void AssignNodes(RegTree const *p_tree, std::vector<MultiExpandEntry> const &valid_candidates,
common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub);
/**
* @brief Decide which node to use as the build node.
*/
void AssignNodes(RegTree const *p_tree, std::vector<CPUExpandEntry> const &candidates,
common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub);
namespace xgboost {
namespace tree {
template <typename ExpandEntry>
class HistogramBuilder {
/*! \brief culmulative histogram of gradients. */
common::HistCollection hist_;
/*! \brief culmulative local parent histogram of gradients. */
common::HistCollection hist_local_worker_;
common::GHistBuilder builder_;
BoundedHistCollection hist_;
common::ParallelGHistBuilder buffer_;
BatchParam param_;
int32_t n_threads_{-1};
size_t n_batches_{0};
// Whether XGBoost is running in distributed environment.
bool is_distributed_{false};
bool is_col_split_{false};
public:
/**
* \param total_bins Total number of bins across all features
* \param max_bin_per_feat Maximum number of bins per feature, same as the `max_bin`
* training parameter.
* \param n_threads Number of threads.
* \param is_distributed Mostly used for testing to allow injecting parameters instead
* @brief Reset the builder, should be called before growing a new tree.
*
* @param total_bins Total number of bins across all features
* @param is_distributed Mostly used for testing to allow injecting parameters instead
* of using global rabit variable.
*/
void Reset(uint32_t total_bins, BatchParam p, int32_t n_threads, size_t n_batches,
bool is_distributed, bool is_col_split) {
CHECK_GE(n_threads, 1);
n_threads_ = n_threads;
n_batches_ = n_batches;
void Reset(Context const *ctx, bst_bin_t total_bins, BatchParam const &p, bool is_distributed,
bool is_col_split, HistMakerTrainParam const *param) {
n_threads_ = ctx->Threads();
param_ = p;
hist_.Init(total_bins);
hist_local_worker_.Init(total_bins);
hist_.Reset(total_bins, param->max_cached_hist_node);
buffer_.Init(total_bins);
builder_ = common::GHistBuilder(total_bins);
is_distributed_ = is_distributed;
is_col_split_ = is_col_split;
// Workaround s390x gcc 7.5.0
auto DMLC_ATTRIBUTE_UNUSED __force_instantiation = &GradientPairPrecise::Reduce;
}
template <bool any_missing>
void BuildLocalHistograms(size_t page_idx, common::BlockedSpace2d space,
GHistIndexMatrix const &gidx,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
void BuildLocalHistograms(common::BlockedSpace2d const &space, GHistIndexMatrix const &gidx,
std::vector<bst_node_t> const &nodes_to_build,
common::RowSetCollection const &row_set_collection,
common::Span<GradientPair const> gpair_h, bool force_read_by_column) {
const size_t n_nodes = nodes_for_explicit_hist_build.size();
CHECK_GT(n_nodes, 0);
std::vector<common::GHistRow> target_hists(n_nodes);
for (size_t i = 0; i < n_nodes; ++i) {
auto const nidx = nodes_for_explicit_hist_build[i].nid;
target_hists[i] = hist_[nidx];
}
if (page_idx == 0) {
// FIXME(jiamingy): Handle different size of space. Right now we use the maximum
// partition size for the buffer, which might not be efficient if partition sizes
// has significant variance.
buffer_.Reset(this->n_threads_, n_nodes, space, target_hists);
}
// Parallel processing by nodes and data in each node
common::ParallelFor2d(space, this->n_threads_, [&](size_t nid_in_set, common::Range1d r) {
const auto tid = static_cast<unsigned>(omp_get_thread_num());
const int32_t nid = nodes_for_explicit_hist_build[nid_in_set].nid;
auto elem = row_set_collection[nid];
bst_node_t const nidx = nodes_to_build[nid_in_set];
auto elem = row_set_collection[nidx];
auto start_of_row_set = std::min(r.begin(), elem.Size());
auto end_of_row_set = std::min(r.end(), elem.Size());
auto rid_set = common::RowSetCollection::Elem(elem.begin + start_of_row_set,
elem.begin + end_of_row_set, nid);
elem.begin + end_of_row_set, nidx);
auto hist = buffer_.GetInitializedHist(tid, nid_in_set);
if (rid_set.Size() != 0) {
builder_.template BuildHist<any_missing>(gpair_h, rid_set, gidx, hist,
force_read_by_column);
common::BuildHist<any_missing>(gpair_h, rid_set, gidx, hist, force_read_by_column);
}
});
}
void AddHistRows(int *starting_index, int *sync_count,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
RegTree const *p_tree) {
if (is_distributed_ && !is_col_split_) {
this->AddHistRowsDistributed(starting_index, sync_count, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick, p_tree);
} else {
this->AddHistRowsLocal(starting_index, sync_count, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick);
}
}
/**
* @brief Allocate histogram, rearrange the nodes if `rearrange` is true and the tree
* has reached the cache size limit.
*/
void AddHistRows(RegTree const *p_tree, std::vector<bst_node_t> *p_nodes_to_build,
std::vector<bst_node_t> *p_nodes_to_sub, bool rearrange) {
CHECK(p_nodes_to_build);
auto &nodes_to_build = *p_nodes_to_build;
CHECK(p_nodes_to_sub);
auto &nodes_to_sub = *p_nodes_to_sub;
/** Main entry point of this class, build histogram for tree nodes. */
void BuildHist(size_t page_id, common::BlockedSpace2d space, GHistIndexMatrix const &gidx,
RegTree const *p_tree, common::RowSetCollection const &row_set_collection,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
common::Span<GradientPair const> gpair, bool force_read_by_column = false) {
int starting_index = std::numeric_limits<int>::max();
int sync_count = 0;
if (page_id == 0) {
this->AddHistRows(&starting_index, &sync_count, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick, p_tree);
}
if (gidx.IsDense()) {
this->BuildLocalHistograms<false>(page_id, space, gidx, nodes_for_explicit_hist_build,
row_set_collection, gpair, force_read_by_column);
} else {
this->BuildLocalHistograms<true>(page_id, space, gidx, nodes_for_explicit_hist_build,
row_set_collection, gpair, force_read_by_column);
// We first check whether the cache size is already exceeded or about to be exceeded.
// If not, then we can allocate histograms without clearing the cache and without
// worrying about missing parent histogram.
//
// Otherwise, we need to rearrange the nodes before the allocation to make sure the
// resulting buffer is contiguous. This is to facilitate efficient allreduce.
bool can_host = this->hist_.CanHost(nodes_to_build, nodes_to_sub);
// True if the tree is still within the size of cache limit. Allocate histogram as
// usual.
auto cache_is_valid = can_host && !this->hist_.HasExceeded();
if (!can_host) {
this->hist_.Clear(true);
}
CHECK_GE(n_batches_, 1);
if (page_id != n_batches_ - 1) {
if (!rearrange || cache_is_valid) {
// If not rearrange, we allocate the histogram as usual, assuming the nodes have
// been properly arranged by other builders.
this->hist_.AllocateHistograms(nodes_to_build, nodes_to_sub);
if (rearrange) {
CHECK(!this->hist_.HasExceeded());
}
return;
}
if (is_distributed_ && !is_col_split_) {
this->SyncHistogramDistributed(p_tree, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick,
starting_index, sync_count);
// The cache is full, parent histogram might have been removed in previous iterations to
// save memory.
std::vector<bst_node_t> can_subtract;
for (auto const &v : nodes_to_sub) {
if (this->hist_.HistogramExists(p_tree->Parent(v))) {
// We can still use the subtraction trick for this node
can_subtract.push_back(v);
} else {
// This node requires a full build
nodes_to_build.push_back(v);
}
}
nodes_to_sub = std::move(can_subtract);
this->hist_.AllocateHistograms(nodes_to_build, nodes_to_sub);
}
/** Main entry point of this class, build histogram for tree nodes. */
void BuildHist(std::size_t page_idx, common::BlockedSpace2d const &space,
GHistIndexMatrix const &gidx, common::RowSetCollection const &row_set_collection,
std::vector<bst_node_t> const &nodes_to_build,
linalg::VectorView<GradientPair const> gpair, bool force_read_by_column = false) {
CHECK(gpair.Contiguous());
if (page_idx == 0) {
// Add the local histogram cache to the parallel buffer before processing the first page.
auto n_nodes = nodes_to_build.size();
std::vector<common::GHistRow> target_hists(n_nodes);
for (size_t i = 0; i < n_nodes; ++i) {
auto const nidx = nodes_to_build[i];
target_hists[i] = hist_[nidx];
}
buffer_.Reset(this->n_threads_, n_nodes, space, target_hists);
}
if (gidx.IsDense()) {
this->BuildLocalHistograms<false>(space, gidx, nodes_to_build, row_set_collection,
gpair.Values(), force_read_by_column);
} else {
this->SyncHistogramLocal(p_tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick);
this->BuildLocalHistograms<true>(space, gidx, nodes_to_build, row_set_collection,
gpair.Values(), force_read_by_column);
}
}
/** same as the other build hist but handles only single batch data (in-core) */
void BuildHist(size_t page_id, GHistIndexMatrix const &gidx, RegTree *p_tree,
common::RowSetCollection const &row_set_collection,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
common::Span<GradientPair const> gpair, bool force_read_by_column = false) {
const size_t n_nodes = nodes_for_explicit_hist_build.size();
// create space of size (# rows in each node)
void SyncHistogram(RegTree const *p_tree, std::vector<bst_node_t> const &nodes_to_build,
std::vector<bst_node_t> const &nodes_to_trick) {
auto n_total_bins = buffer_.TotalBins();
common::BlockedSpace2d space(
n_nodes,
[&](size_t nidx_in_set) {
const int32_t nidx = nodes_for_explicit_hist_build[nidx_in_set].nid;
return row_set_collection[nidx].Size();
},
256);
this->BuildHist(page_id, space, gidx, p_tree, row_set_collection, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick, gpair, force_read_by_column);
}
void SyncHistogramDistributed(RegTree const *p_tree,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
int starting_index, int sync_count) {
const size_t nbins = builder_.GetNumBins();
common::BlockedSpace2d space(
nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; }, 1024);
common::ParallelFor2d(space, n_threads_, [&](size_t node, common::Range1d r) {
const auto &entry = nodes_for_explicit_hist_build[node];
auto this_hist = this->hist_[entry.nid];
// Merging histograms from each thread into once
buffer_.ReduceHist(node, r.begin(), r.end());
// Store posible parent node
auto this_local = hist_local_worker_[entry.nid];
common::CopyHist(this_local, this_hist, r.begin(), r.end());
if (!p_tree->IsRoot(entry.nid)) {
const size_t parent_id = p_tree->Parent(entry.nid);
const int subtraction_node_id = nodes_for_subtraction_trick[node].nid;
auto parent_hist = this->hist_local_worker_[parent_id];
auto sibling_hist = this->hist_[subtraction_node_id];
common::SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
// Store posible parent node
auto sibling_local = hist_local_worker_[subtraction_node_id];
common::CopyHist(sibling_local, sibling_hist, r.begin(), r.end());
}
});
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double *>(this->hist_[starting_index].data()),
builder_.GetNumBins() * sync_count * 2);
ParallelSubtractionHist(space, nodes_for_explicit_hist_build, nodes_for_subtraction_trick,
p_tree);
common::BlockedSpace2d space2(
nodes_for_subtraction_trick.size(), [&](size_t) { return nbins; }, 1024);
ParallelSubtractionHist(space2, nodes_for_subtraction_trick, nodes_for_explicit_hist_build,
p_tree);
}
void SyncHistogramLocal(RegTree const *p_tree,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick) {
const size_t nbins = this->builder_.GetNumBins();
common::BlockedSpace2d space(
nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; }, 1024);
nodes_to_build.size(), [&](std::size_t) { return n_total_bins; }, 1024);
common::ParallelFor2d(space, this->n_threads_, [&](size_t node, common::Range1d r) {
const auto &entry = nodes_for_explicit_hist_build[node];
auto this_hist = this->hist_[entry.nid];
// Merging histograms from each thread into once
// Merging histograms from each thread.
this->buffer_.ReduceHist(node, r.begin(), r.end());
if (!p_tree->IsRoot(entry.nid)) {
auto const parent_id = p_tree->Parent(entry.nid);
auto const subtraction_node_id = nodes_for_subtraction_trick[node].nid;
auto parent_hist = this->hist_[parent_id];
auto sibling_hist = this->hist_[subtraction_node_id];
common::SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
}
});
if (is_distributed_ && !is_col_split_) {
// The cache is contiguous, we can perform allreduce for all nodes in one go.
CHECK(!nodes_to_build.empty());
auto first_nidx = nodes_to_build.front();
std::size_t n = n_total_bins * nodes_to_build.size() * 2;
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<double *>(this->hist_[first_nidx].data()), n);
}
common::BlockedSpace2d const &subspace =
nodes_to_trick.size() == nodes_to_build.size()
? space
: common::BlockedSpace2d{nodes_to_trick.size(),
[&](std::size_t) { return n_total_bins; }, 1024};
common::ParallelFor2d(
subspace, this->n_threads_, [&](std::size_t nidx_in_set, common::Range1d r) {
auto subtraction_nidx = nodes_to_trick[nidx_in_set];
auto parent_id = p_tree->Parent(subtraction_nidx);
auto sibling_nidx = p_tree->IsLeftChild(subtraction_nidx) ? p_tree->RightChild(parent_id)
: p_tree->LeftChild(parent_id);
auto sibling_hist = this->hist_[sibling_nidx];
auto parent_hist = this->hist_[parent_id];
auto subtract_hist = this->hist_[subtraction_nidx];
common::SubtractionHist(subtract_hist, parent_hist, sibling_hist, r.begin(), r.end());
});
}
public:
/* Getters for tests. */
common::HistCollection const &Histogram() { return hist_; }
auto& Buffer() { return buffer_; }
private:
void
ParallelSubtractionHist(const common::BlockedSpace2d &space,
const std::vector<ExpandEntry> &nodes,
const std::vector<ExpandEntry> &subtraction_nodes,
const RegTree *p_tree) {
common::ParallelFor2d(
space, this->n_threads_, [&](size_t node, common::Range1d r) {
const auto &entry = nodes[node];
if (!(p_tree->IsLeftChild(entry.nid))) {
auto this_hist = this->hist_[entry.nid];
if (!p_tree->IsRoot(entry.nid)) {
const int subtraction_node_id = subtraction_nodes[node].nid;
auto parent_hist = hist_[(*p_tree)[entry.nid].Parent()];
auto sibling_hist = hist_[subtraction_node_id];
common::SubtractionHist(this_hist, parent_hist, sibling_hist,
r.begin(), r.end());
}
}
});
}
// Register histogram rows for a local (non-distributed) training step.
//
// A row is added to hist_ for every node that is built explicitly and for every
// node obtained through the subtraction trick, after which the storage is
// allocated in one go.
//
// starting_index: in/out, lowered to the smallest explicitly built node id.
// sync_count:     out, set to the number of explicitly built nodes.
void AddHistRowsLocal(int *starting_index, int *sync_count,
                      std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
                      std::vector<ExpandEntry> const &nodes_for_subtraction_trick) {
  int &lowest_nid = *starting_index;
  for (auto const &build_entry : nodes_for_explicit_hist_build) {
    int const nid = build_entry.nid;
    this->hist_.AddHistRow(nid);
    if (nid < lowest_nid) {
      lowest_nid = nid;
    }
  }
  *sync_count = static_cast<int>(nodes_for_explicit_hist_build.size());

  for (auto const &sub_entry : nodes_for_subtraction_trick) {
    this->hist_.AddHistRow(sub_entry.nid);
  }
  this->hist_.AllocateAllData();
}
void AddHistRowsDistributed(int *starting_index, int *sync_count,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
RegTree const *p_tree) {
const size_t explicit_size = nodes_for_explicit_hist_build.size();
const size_t subtaction_size = nodes_for_subtraction_trick.size();
std::vector<int> merged_node_ids(explicit_size + subtaction_size);
for (size_t i = 0; i < explicit_size; ++i) {
merged_node_ids[i] = nodes_for_explicit_hist_build[i].nid;
}
for (size_t i = 0; i < subtaction_size; ++i) {
merged_node_ids[explicit_size + i] = nodes_for_subtraction_trick[i].nid;
}
std::sort(merged_node_ids.begin(), merged_node_ids.end());
int n_left = 0;
for (auto const &nid : merged_node_ids) {
if (p_tree->IsLeftChild(nid)) {
this->hist_.AddHistRow(nid);
(*starting_index) = std::min(nid, (*starting_index));
n_left++;
this->hist_local_worker_.AddHistRow(nid);
}
}
for (auto const &nid : merged_node_ids) {
if (!(p_tree->IsLeftChild(nid))) {
this->hist_.AddHistRow(nid);
this->hist_local_worker_.AddHistRow(nid);
}
}
this->hist_.AllocateAllData();
this->hist_local_worker_.AllocateAllData();
(*sync_count) = std::max(1, n_left);
}
// Read-only access to the hist_ member.
[[nodiscard]] BoundedHistCollection const &Histogram() const { return hist_; }
// Mutable access to the hist_ member.
[[nodiscard]] BoundedHistCollection &Histogram() { return hist_; }
// Mutable access to the buffer_ member.
auto &Buffer() { return buffer_; }
};
// Construct a work space for building histogram. Eventually we should move this
// function into histogram builder once hist tree method supports external memory.
template <typename Partitioner, typename ExpandEntry = CPUExpandEntry>
template <typename Partitioner>
common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
std::vector<ExpandEntry> const &nodes_to_build) {
std::vector<size_t> partition_size(nodes_to_build.size(), 0);
std::vector<bst_node_t> const &nodes_to_build) {
// FIXME(jiamingy): Handle different size of space. Right now we use the maximum
// partition size for the buffer, which might not be efficient if partition sizes
// has significant variance.
std::vector<std::size_t> partition_size(nodes_to_build.size(), 0);
for (auto const &partition : partitioners) {
size_t k = 0;
for (auto node : nodes_to_build) {
auto n_rows_in_node = partition.Partitions()[node.nid].Size();
for (auto nidx : nodes_to_build) {
auto n_rows_in_node = partition.Partitions()[nidx].Size();
partition_size[k] = std::max(partition_size[k], n_rows_in_node);
k++;
}
@@ -322,6 +235,107 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
nodes_to_build.size(), [&](size_t nidx_in_set) { return partition_size[nidx_in_set]; }, 256};
return space;
}
} // namespace tree
} // namespace xgboost
/**
 * @brief Histogram builder that can handle multiple targets.
 *
 * Holds one HistogramBuilder per target and forwards every operation to each of
 * them.  Reset() must be called before any of the build functions so that the
 * number of builders matches the number of targets of the tree.
 */
class MultiHistogramBuilder {
  std::vector<HistogramBuilder> target_builders_;
  Context const *ctx_{nullptr};

 public:
  /**
   * @brief Build the histogram for root node.
   *
   * @param p_fmat               Training data; its row count must equal gpair.Shape(0).
   * @param p_tree               Tree being grown; provides the number of targets.
   * @param partitioners         Row partitioners, indexed by page.
   * @param gpair                Gradient matrix, one column per target.
   * @param best                 Expand entry whose `nid` identifies the root node.
   * @param param                Batch parameter used to fetch the gradient index pages.
   * @param force_read_by_column Forwarded to HistogramBuilder::BuildHist.
   */
  template <typename Partitioner, typename ExpandEntry>
  void BuildRootHist(DMatrix *p_fmat, RegTree const *p_tree,
                     std::vector<Partitioner> const &partitioners,
                     linalg::MatrixView<GradientPair const> gpair, ExpandEntry const &best,
                     BatchParam const &param, bool force_read_by_column = false) {
    auto n_targets = p_tree->NumTargets();
    CHECK_EQ(gpair.Shape(1), n_targets);
    CHECK_EQ(p_fmat->Info().num_row_, gpair.Shape(0));
    CHECK_EQ(target_builders_.size(), n_targets);
    std::vector<bst_node_t> nodes{best.nid};
    // The root has no sibling, so nothing is derived by subtraction.
    std::vector<bst_node_t> dummy_sub;

    auto space = ConstructHistSpace(partitioners, nodes);
    for (bst_target_t t{0}; t < n_targets; ++t) {
      this->target_builders_[t].AddHistRows(p_tree, &nodes, &dummy_sub, false);
    }
    CHECK(dummy_sub.empty());

    std::size_t page_idx{0};
    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, param)) {
      for (bst_target_t t{0}; t < n_targets; ++t) {
        auto t_gpair = gpair.Slice(linalg::All(), t);
        this->target_builders_[t].BuildHist(page_idx, space, gidx,
                                            partitioners[page_idx].Partitions(), nodes, t_gpair,
                                            force_read_by_column);
      }
      ++page_idx;
    }

    for (bst_target_t t{0}; t < n_targets; ++t) {
      this->target_builders_[t].SyncHistogram(p_tree, nodes, dummy_sub);
    }
  }

  /**
   * @brief Build histogram for left and right child of valid candidates
   *
   * @param p_fmat               Training data; its row count must equal gpair.Shape(0).
   * @param p_tree               Tree being grown; provides the number of targets.
   * @param partitioners         Row partitioners, indexed by page.
   * @param valid_candidates     Nodes that were split; histograms are produced for
   *                             both children of each of them.
   * @param gpair                Gradient matrix, one column per target.
   * @param param                Batch parameter used to fetch the gradient index pages.
   * @param force_read_by_column Forwarded to HistogramBuilder::BuildHist.
   */
  template <typename Partitioner, typename ExpandEntry>
  void BuildHistLeftRight(DMatrix *p_fmat, RegTree const *p_tree,
                          std::vector<Partitioner> const &partitioners,
                          std::vector<ExpandEntry> const &valid_candidates,
                          linalg::MatrixView<GradientPair const> gpair, BatchParam const &param,
                          bool force_read_by_column = false) {
    // Validate the inputs up front, mirroring BuildRootHist.  The builders are
    // indexed by target below and front() is undefined on an empty vector.
    CHECK(!target_builders_.empty());
    auto n_targets = p_tree->NumTargets();
    CHECK_EQ(gpair.Shape(1), n_targets);
    CHECK_EQ(p_fmat->Info().num_row_, gpair.Shape(0));
    CHECK_EQ(target_builders_.size(), n_targets);

    std::vector<bst_node_t> nodes_to_build(valid_candidates.size());
    std::vector<bst_node_t> nodes_to_sub(valid_candidates.size());
    AssignNodes(p_tree, valid_candidates, nodes_to_build, nodes_to_sub);

    // use the first builder for getting number of valid nodes.
    target_builders_.front().AddHistRows(p_tree, &nodes_to_build, &nodes_to_sub, true);
    CHECK_GE(nodes_to_build.size(), nodes_to_sub.size());
    CHECK_EQ(nodes_to_sub.size() + nodes_to_build.size(), valid_candidates.size() * 2);

    // allocate storage for the rest of the builders
    for (bst_target_t t = 1; t < n_targets; ++t) {
      target_builders_[t].AddHistRows(p_tree, &nodes_to_build, &nodes_to_sub, false);
    }

    auto space = ConstructHistSpace(partitioners, nodes_to_build);
    std::size_t page_idx{0};
    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, param)) {
      for (bst_target_t t = 0; t < n_targets; ++t) {
        auto t_gpair = gpair.Slice(linalg::All(), t);
        this->target_builders_[t].BuildHist(page_idx, space, page,
                                            partitioners[page_idx].Partitions(), nodes_to_build,
                                            t_gpair, force_read_by_column);
      }
      page_idx++;
    }

    for (bst_target_t t = 0; t < n_targets; ++t) {
      this->target_builders_[t].SyncHistogram(p_tree, nodes_to_build, nodes_to_sub);
    }
  }

  /** @brief Read-only access to the histogram collection of target `t`. */
  [[nodiscard]] auto const &Histogram(bst_target_t t) const {
    return target_builders_[t].Histogram();
  }
  /** @brief Mutable access to the histogram collection of target `t`. */
  [[nodiscard]] auto &Histogram(bst_target_t t) { return target_builders_[t].Histogram(); }

  /**
   * @brief (Re)create one builder per target and reset each of them.
   *
   * @param ctx            Context stored for later page retrieval and forwarded to
   *                       the builders.
   * @param total_bins     Forwarded to HistogramBuilder::Reset.
   * @param n_targets      Number of targets, must be at least 1.
   * @param p              Batch parameter forwarded to the builders.
   * @param is_distributed Forwarded to HistogramBuilder::Reset.
   * @param is_col_split   Forwarded to HistogramBuilder::Reset.
   * @param param          Histogram training parameters forwarded to the builders.
   */
  void Reset(Context const *ctx, bst_bin_t total_bins, bst_target_t n_targets, BatchParam const &p,
             bool is_distributed, bool is_col_split, HistMakerTrainParam const *param) {
    // Reject an invalid target count before touching any state.
    CHECK_GE(n_targets, 1);
    ctx_ = ctx;
    target_builders_.resize(n_targets);
    for (auto &v : target_builders_) {
      v.Reset(ctx, total_bins, p, is_distributed, is_col_split, param);
    }
  }
};
} // namespace xgboost::tree
#endif // XGBOOST_TREE_HIST_HISTOGRAM_H_

34
src/tree/hist/param.cc Normal file
View File

@@ -0,0 +1,34 @@
/**
* Copyright 2021-2023, XGBoost Contributors
*/
#include "param.h"
#include <string> // for string
#include "../../collective/communicator-inl.h" // for GetRank, Broadcast
#include "xgboost/json.h" // for Object, Json
#include "xgboost/tree_model.h" // for RegTree
namespace xgboost::tree {
DMLC_REGISTER_PARAMETER(HistMakerTrainParam);
/**
 * Debug check that the local tree equals the tree held by rank 0.
 *
 * Does nothing unless `debug_synchronize` is set.  Otherwise rank 0 serialises
 * its tree to binary JSON (all other ranks serialise an empty object), the
 * buffer is broadcast from rank 0, and the tree decoded from it is compared
 * with `local_tree`; a mismatch fails the CHECK.
 */
void HistMakerTrainParam::CheckTreesSynchronized(RegTree const* local_tree) const {
  if (this->debug_synchronize) {
    Json j_local{Object{}};
    if (collective::GetRank() == 0) {
      local_tree->SaveModel(&j_local);
    }

    // After the broadcast every worker holds the bytes produced by rank 0.
    std::string buffer;
    Json::Dump(j_local, &buffer, std::ios::binary);
    collective::Broadcast(&buffer, 0);

    RegTree ref_tree{};  // rank 0 tree
    auto j_rank0 = Json::Load(StringView{buffer}, std::ios::binary);
    ref_tree.LoadModel(j_rank0);
    CHECK(*local_tree == ref_tree);
  }
}
} // namespace xgboost::tree

31
src/tree/hist/param.h Normal file
View File

@@ -0,0 +1,31 @@
/**
* Copyright 2021-2023, XGBoost Contributors
*/
#pragma once
#include <cstddef> // for size_t
#include "xgboost/parameter.h" // for XGBoostParameter
#include "xgboost/tree_model.h" // for RegTree
namespace xgboost::tree {
// Training parameters shared by the histogram-based tree updaters.
struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
// Default value of max_cached_hist_node: 1 << 16 (65536) nodes.
constexpr static std::size_t DefaultNodes() { return static_cast<std::size_t>(1) << 16; }
// When true, CheckTreesSynchronized() performs its comparison; off by default.
bool debug_synchronize{false};
// Upper bound on the number of nodes kept in the CPU histogram cache.
std::size_t max_cached_hist_node{DefaultNodes()};
// Defined in param.cc: when debug_synchronize is set, compares `local_tree`
// with the tree broadcast from rank 0 and fails a CHECK if they differ.
void CheckTreesSynchronized(RegTree const* local_tree) const;
// declare parameters
DMLC_DECLARE_PARAMETER(HistMakerTrainParam) {
DMLC_DECLARE_FIELD(debug_synchronize)
.set_default(false)
.describe("Check if all distributed tree are identical after tree construction.");
// Values below 1 are rejected by the lower bound.
DMLC_DECLARE_FIELD(max_cached_hist_node)
.set_default(DefaultNodes())
.set_lower_bound(1)
.describe("Maximum number of nodes in CPU histogram cache. Only for internal usage.");
}
};
} // namespace xgboost::tree

View File

@@ -526,7 +526,7 @@ struct SplitEntryContainer {
* \return whether the proposed split is better and can replace current split
*/
template <typename GradientSumT>
bool Update(bst_float new_loss_chg, unsigned split_index, bst_float new_split_value,
bool Update(bst_float new_loss_chg, bst_feature_t split_index, float new_split_value,
bool default_left, bool is_cat, GradientSumT const &left_sum,
GradientSumT const &right_sum) {
if (this->NeedReplace(new_loss_chg, split_index)) {

View File

@@ -213,7 +213,7 @@ std::vector<bst_cat_t> GetSplitCategories(RegTree const &tree, int32_t nidx) {
auto split = common::KCatBitField{csr.categories.subspan(seg.beg, seg.size)};
std::vector<bst_cat_t> cats;
for (size_t i = 0; i < split.Size(); ++i) {
for (size_t i = 0; i < split.Capacity(); ++i) {
if (split.Check(i)) {
cats.push_back(static_cast<bst_cat_t>(i));
}
@@ -398,11 +398,14 @@ class JsonGenerator : public TreeGenerator {
static std::string const kIndicatorTemplate =
R"ID( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", "yes": {yes}, "no": {no})ID";
auto split_index = tree[nid].SplitIndex();
auto fname = fmap_.Name(split_index);
std::string qfname; // quoted
common::EscapeU8(fname, &qfname);
auto result = SuperT::Match(
kIndicatorTemplate,
{{"{nid}", std::to_string(nid)},
{"{depth}", std::to_string(depth)},
{"{fname}", fmap_.Name(split_index)},
{"{fname}", qfname},
{"{yes}", std::to_string(nyes)},
{"{no}", std::to_string(tree[nid].DefaultChild())}});
return result;
@@ -430,12 +433,14 @@ class JsonGenerator : public TreeGenerator {
std::string const &template_str, std::string cond,
uint32_t depth) const {
auto split_index = tree[nid].SplitIndex();
auto fname = split_index < fmap_.Size() ? fmap_.Name(split_index) : std::to_string(split_index);
std::string qfname; // quoted
common::EscapeU8(fname, &qfname);
std::string const result = SuperT::Match(
template_str,
{{"{nid}", std::to_string(nid)},
{"{depth}", std::to_string(depth)},
{"{fname}", split_index < fmap_.Size() ? fmap_.Name(split_index) :
std::to_string(split_index)},
{"{fname}", qfname},
{"{cond}", cond},
{"{left}", std::to_string(tree[nid].LeftChild())},
{"{right}", std::to_string(tree[nid].RightChild())},
@@ -1004,7 +1009,7 @@ void RegTree::SaveCategoricalSplit(Json* p_out) const {
auto segment = split_categories_segments_[i];
auto node_categories = this->GetSplitCategories().subspan(segment.beg, segment.size);
common::KCatBitField const cat_bits(node_categories);
for (size_t i = 0; i < cat_bits.Size(); ++i) {
for (size_t i = 0; i < cat_bits.Capacity(); ++i) {
if (cat_bits.Check(i)) {
categories.GetArray().emplace_back(i);
}

View File

@@ -3,27 +3,39 @@
*
* \brief Implementation for the approx tree method.
*/
#include <algorithm>
#include <memory>
#include <vector>
#include <algorithm> // for max, transform, fill_n
#include <cstddef> // for size_t
#include <map> // for map
#include <memory> // for allocator, unique_ptr, make_shared, make_unique
#include <utility> // for move
#include <vector> // for vector
#include "../collective/aggregator.h"
#include "../common/random.h"
#include "../data/gradient_index.h"
#include "common_row_partitioner.h"
#include "constraints.h"
#include "driver.h"
#include "hist/evaluate_splits.h"
#include "hist/histogram.h"
#include "hist/sampler.h" // for SampleGradient
#include "param.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/json.h"
#include "xgboost/linalg.h"
#include "xgboost/task.h" // for ObjInfo
#include "xgboost/tree_model.h"
#include "xgboost/tree_updater.h" // for TreeUpdater
#include "../collective/aggregator.h" // for GlobalSum
#include "../collective/communicator-inl.h" // for IsDistributed
#include "../common/hist_util.h" // for HistogramCuts
#include "../common/random.h" // for ColumnSampler
#include "../common/timer.h" // for Monitor
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "common_row_partitioner.h" // for CommonRowPartitioner
#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG
#include "driver.h" // for Driver
#include "hist/evaluate_splits.h" // for HistEvaluator, UpdatePredictionCacheImpl
#include "hist/expand_entry.h" // for CPUExpandEntry
#include "hist/histogram.h" // for MultiHistogramBuilder
#include "hist/param.h" // for HistMakerTrainParam
#include "hist/sampler.h" // for SampleGradient
#include "param.h" // for GradStats, TrainParam
#include "xgboost/base.h" // for Args, GradientPair, bst_node_t, bst_bin_t
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for DMatrix, BatchSet, BatchIterator, MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/json.h" // for Object, Json, FromJson, ToJson, get
#include "xgboost/linalg.h" // for Matrix, MakeTensorView, Empty, MatrixView
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK
#include "xgboost/span.h" // for Span
#include "xgboost/task.h" // for ObjInfo
#include "xgboost/tree_model.h" // for RegTree, RTreeNodeStat
#include "xgboost/tree_updater.h" // for TreeUpdater, TreeUpdaterReg, XGBOOST_REGISTE...
namespace xgboost::tree {
@@ -43,9 +55,10 @@ auto BatchSpec(TrainParam const &p, common::Span<float> hess) {
class GloablApproxBuilder {
protected:
TrainParam const *param_;
HistMakerTrainParam const *hist_param_{nullptr};
std::shared_ptr<common::ColumnSampler> col_sampler_;
HistEvaluator evaluator_;
HistogramBuilder<CPUExpandEntry> histogram_builder_;
MultiHistogramBuilder histogram_builder_;
Context const *ctx_;
ObjInfo const *const task_;
@@ -58,7 +71,7 @@ class GloablApproxBuilder {
common::HistogramCuts feature_values_;
public:
void InitData(DMatrix *p_fmat, common::Span<float> hess) {
void InitData(DMatrix *p_fmat, RegTree const *p_tree, common::Span<float> hess) {
monitor_->Start(__func__);
n_batches_ = 0;
@@ -78,8 +91,9 @@ class GloablApproxBuilder {
n_batches_++;
}
histogram_builder_.Reset(n_total_bins, BatchSpec(*param_, hess), ctx_->Threads(), n_batches_,
collective::IsDistributed(), p_fmat->Info().IsColumnSplit());
histogram_builder_.Reset(ctx_, n_total_bins, p_tree->NumTargets(), BatchSpec(*param_, hess),
collective::IsDistributed(), p_fmat->Info().IsColumnSplit(),
hist_param_);
monitor_->Stop(__func__);
}
@@ -95,20 +109,16 @@ class GloablApproxBuilder {
}
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&root_sum), 2);
std::vector<CPUExpandEntry> nodes{best};
size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), nodes,
{}, gpair);
i++;
}
this->histogram_builder_.BuildRootHist(p_fmat, p_tree, partitioner_,
linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1),
best, BatchSpec(*param_, hess));
auto weight = evaluator_.InitRoot(root_sum);
p_tree->Stat(RegTree::kRoot).sum_hess = root_sum.GetHess();
p_tree->Stat(RegTree::kRoot).base_weight = weight;
(*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
auto const &histograms = histogram_builder_.Histogram();
auto const &histograms = histogram_builder_.Histogram(0);
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
evaluator_.EvaluateSplits(histograms, feature_values_, ft, *p_tree, &nodes);
monitor_->Stop(__func__);
@@ -129,30 +139,9 @@ class GloablApproxBuilder {
std::vector<CPUExpandEntry> const &valid_candidates,
std::vector<GradientPair> const &gpair, common::Span<float> hess) {
monitor_->Start(__func__);
std::vector<CPUExpandEntry> nodes_to_build;
std::vector<CPUExpandEntry> nodes_to_sub;
for (auto const &c : valid_candidates) {
auto left_nidx = (*p_tree)[c.nid].LeftChild();
auto right_nidx = (*p_tree)[c.nid].RightChild();
auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
auto build_nidx = left_nidx;
auto subtract_nidx = right_nidx;
if (fewer_right) {
std::swap(build_nidx, subtract_nidx);
}
nodes_to_build.push_back(CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}});
nodes_to_sub.push_back(CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}});
}
size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes_to_build, nodes_to_sub, gpair);
i++;
}
this->histogram_builder_.BuildHistLeftRight(
p_fmat, p_tree, partitioner_, valid_candidates,
linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), BatchSpec(*param_, hess));
monitor_->Stop(__func__);
}
@@ -169,10 +158,12 @@ class GloablApproxBuilder {
}
public:
explicit GloablApproxBuilder(TrainParam const *param, MetaInfo const &info, Context const *ctx,
explicit GloablApproxBuilder(TrainParam const *param, HistMakerTrainParam const *hist_param,
MetaInfo const &info, Context const *ctx,
std::shared_ptr<common::ColumnSampler> column_sampler,
ObjInfo const *task, common::Monitor *monitor)
: param_{param},
hist_param_{hist_param},
col_sampler_{std::move(column_sampler)},
evaluator_{ctx, param_, info, col_sampler_},
ctx_{ctx},
@@ -182,7 +173,7 @@ class GloablApproxBuilder {
void UpdateTree(DMatrix *p_fmat, std::vector<GradientPair> const &gpair, common::Span<float> hess,
RegTree *p_tree, HostDeviceVector<bst_node_t> *p_out_position) {
p_last_tree_ = p_tree;
this->InitData(p_fmat, hess);
this->InitData(p_fmat, p_tree, hess);
Driver<CPUExpandEntry> driver(*param_);
auto &tree = *p_tree;
@@ -232,7 +223,7 @@ class GloablApproxBuilder {
best_splits.push_back(l_best);
best_splits.push_back(r_best);
}
auto const &histograms = histogram_builder_.Histogram();
auto const &histograms = histogram_builder_.Histogram(0);
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
monitor_->Start("EvaluateSplits");
evaluator_.EvaluateSplits(histograms, feature_values_, ft, *p_tree, &best_splits);
@@ -260,6 +251,7 @@ class GlobalApproxUpdater : public TreeUpdater {
std::shared_ptr<common::ColumnSampler> column_sampler_ =
std::make_shared<common::ColumnSampler>();
ObjInfo const *task_;
HistMakerTrainParam hist_param_;
public:
explicit GlobalApproxUpdater(Context const *ctx, ObjInfo const *task)
@@ -267,25 +259,33 @@ class GlobalApproxUpdater : public TreeUpdater {
monitor_.Init(__func__);
}
void Configure(Args const &) override {}
void LoadConfig(Json const &) override {}
void SaveConfig(Json *) const override {}
void Configure(Args const &args) override { hist_param_.UpdateAllowUnknown(args); }
void LoadConfig(Json const &in) override {
auto const &config = get<Object const>(in);
FromJson(config.at("hist_train_param"), &hist_param_);
}
void SaveConfig(Json *p_out) const override {
auto &out = *p_out;
out["hist_train_param"] = ToJson(hist_param_);
}
void InitData(TrainParam const &param, HostDeviceVector<GradientPair> const *gpair,
void InitData(TrainParam const &param, linalg::Matrix<GradientPair> const *gpair,
linalg::Matrix<GradientPair> *sampled) {
*sampled = linalg::Empty<GradientPair>(ctx_, gpair->Size(), 1);
sampled->Data()->Copy(*gpair);
auto in = gpair->HostView().Values();
std::copy(in.data(), in.data() + in.size(), sampled->HostView().Values().data());
SampleGradient(ctx_, param, sampled->HostView());
}
[[nodiscard]] char const *Name() const override { return "grow_histmaker"; }
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *m,
void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *m,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) override {
pimpl_ = std::make_unique<GloablApproxBuilder>(param, m->Info(), ctx_, column_sampler_, task_,
&monitor_);
CHECK(hist_param_.GetInitialised());
pimpl_ = std::make_unique<GloablApproxBuilder>(param, &hist_param_, m->Info(), ctx_,
column_sampler_, task_, &monitor_);
linalg::Matrix<GradientPair> h_gpair;
// Obtain the hessian values for weighted sketching
@@ -300,6 +300,7 @@ class GlobalApproxUpdater : public TreeUpdater {
std::size_t t_idx = 0;
for (auto p_tree : trees) {
this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]);
hist_param_.CheckTreesSynchronized(p_tree);
++t_idx;
}
}

View File

@@ -91,7 +91,7 @@ class ColMaker: public TreeUpdater {
}
}
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *dmat,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree *> &trees) override {
if (collective::IsDistributed()) {
@@ -106,10 +106,11 @@ class ColMaker: public TreeUpdater {
// rescale learning rate according to size of trees
interaction_constraints_.Configure(*param, dmat->Info().num_row_);
// build tree
CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented();
for (auto tree : trees) {
CHECK(ctx_);
Builder builder(*param, colmaker_param_, interaction_constraints_, ctx_, column_densities_);
builder.Update(gpair->ConstHostVector(), dmat, tree);
builder.Update(gpair->Data()->ConstHostVector(), dmat, tree);
}
}

View File

@@ -72,7 +72,6 @@ struct DeviceSplitCandidate {
// split.
bst_cat_t thresh{-1};
common::CatBitField split_cats;
bool is_cat { false };
GradientPairInt64 left_sum;
@@ -80,12 +79,6 @@ struct DeviceSplitCandidate {
XGBOOST_DEVICE DeviceSplitCandidate() {} // NOLINT
template <typename T>
XGBOOST_DEVICE void SetCat(T c) {
this->split_cats.Set(common::AsCat(c));
fvalue = std::max(this->fvalue, static_cast<float>(c));
}
XGBOOST_DEVICE void Update(float loss_chg_in, DefaultDirection dir_in, float fvalue_in,
int findex_in, GradientPairInt64 left_sum_in,
GradientPairInt64 right_sum_in, bool cat,
@@ -108,22 +101,23 @@ struct DeviceSplitCandidate {
*/
XGBOOST_DEVICE void UpdateCat(float loss_chg_in, DefaultDirection dir_in, bst_cat_t thresh_in,
bst_feature_t findex_in, GradientPairInt64 left_sum_in,
GradientPairInt64 right_sum_in, GPUTrainingParam const& param, const GradientQuantiser& quantiser) {
if (loss_chg_in > loss_chg &&
quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight &&
quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) {
loss_chg = loss_chg_in;
dir = dir_in;
fvalue = std::numeric_limits<float>::quiet_NaN();
thresh = thresh_in;
is_cat = true;
left_sum = left_sum_in;
right_sum = right_sum_in;
findex = findex_in;
}
GradientPairInt64 right_sum_in, GPUTrainingParam const& param,
const GradientQuantiser& quantiser) {
if (loss_chg_in > loss_chg &&
quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight &&
quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) {
loss_chg = loss_chg_in;
dir = dir_in;
fvalue = std::numeric_limits<float>::quiet_NaN();
thresh = thresh_in;
is_cat = true;
left_sum = left_sum_in;
right_sum = right_sum_in;
findex = findex_in;
}
}
XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
[[nodiscard]] XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
friend std::ostream& operator<<(std::ostream& os, DeviceSplitCandidate const& c) {
os << "loss_chg:" << c.loss_chg << ", "

View File

@@ -7,12 +7,13 @@
#include <algorithm>
#include <cmath>
#include <limits>
#include <memory>
#include <utility>
#include <cstddef> // for size_t
#include <memory> // for unique_ptr, make_unique
#include <utility> // for move
#include <vector>
#include "../collective/communicator-inl.cuh"
#include "../collective/aggregator.h"
#include "../collective/aggregator.cuh"
#include "../common/bitfield.h"
#include "../common/categorical.h"
@@ -22,6 +23,7 @@
#include "../common/io.h"
#include "../common/timer.h"
#include "../data/ellpack_page.cuh"
#include "../data/ellpack_page.h"
#include "constraints.cuh"
#include "driver.h"
#include "gpu_hist/evaluate_splits.cuh"
@@ -30,8 +32,8 @@
#include "gpu_hist/gradient_based_sampler.cuh"
#include "gpu_hist/histogram.cuh"
#include "gpu_hist/row_partitioner.cuh"
#include "hist/param.h"
#include "param.h"
#include "split_evaluator.h"
#include "updater_gpu_common.cuh"
#include "xgboost/base.h"
#include "xgboost/context.h"
@@ -48,20 +50,6 @@ namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
#endif // !defined(GTEST_TEST)
// training parameters specific to this algorithm
struct GPUHistMakerTrainParam
: public XGBoostParameter<GPUHistMakerTrainParam> {
bool debug_synchronize;
// declare parameters
DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) {
DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe(
"Check if all distributed tree are identical after tree construction.");
}
};
#if !defined(GTEST_TEST)
DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam);
#endif // !defined(GTEST_TEST)
/**
* \struct DeviceHistogramStorage
*
@@ -170,16 +158,16 @@ class DeviceHistogramStorage {
};
// Manage memory for a single GPU
template <typename GradientSumT>
struct GPUHistMakerDevice {
private:
GPUHistEvaluator evaluator_;
Context const* ctx_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
MetaInfo const& info_;
public:
EllpackPageImpl const* page;
EllpackPageImpl const* page{nullptr};
common::Span<FeatureType const> feature_types;
BatchParam batch_param;
std::unique_ptr<RowPartitioner> row_partitioner;
DeviceHistogramStorage<> hist{};
@@ -199,98 +187,95 @@ struct GPUHistMakerDevice {
dh::PinnedMemory pinned2;
common::Monitor monitor;
common::ColumnSampler column_sampler;
FeatureInteractionConstraintDevice interaction_constraints;
std::unique_ptr<GradientBasedSampler> sampler;
std::unique_ptr<FeatureGroups> feature_groups;
GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
BatchParam _batch_param)
GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
uint32_t n_features, BatchParam batch_param, MetaInfo const& info)
: evaluator_{_param, n_features, ctx->gpu_id},
ctx_(ctx),
page(_page),
feature_types{_feature_types},
param(std::move(_param)),
column_sampler(column_sampler_seed),
column_sampler_(std::move(column_sampler)),
interaction_constraints(param, n_features),
batch_param(std::move(_batch_param)) {
sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample,
param.sampling_method));
info_{info} {
sampler = std::make_unique<GradientBasedSampler>(ctx, _n_rows, batch_param, param.subsample,
param.sampling_method, is_external_memory);
if (!param.monotone_constraints.empty()) {
// Copy assigning an empty vector causes an exception in MSVC debug builds
monotone_constraints = param.monotone_constraints;
}
// Init histogram
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
CHECK(column_sampler_);
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
sizeof(GradientSumT)));
}
~GPUHistMakerDevice() { // NOLINT
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
~GPUHistMakerDevice() = default;
void InitFeatureGroupsOnce() {
if (!feature_groups) {
CHECK(page);
feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
sizeof(GradientPairPrecise));
}
}
// Reset values for each update iteration
// Note that the column sampler must be passed by value because it is not
// thread safe
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
auto const& info = dmat->Info();
this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
ctx_->gpu_id);
this->interaction_constraints.Reset();
if (d_gpair.size() != dh_gpair->Size()) {
d_gpair.resize(dh_gpair->Size());
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair), cudaMemcpyDeviceToDevice));
dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
cudaMemcpyDeviceToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair), hipMemcpyDeviceToDevice));
dh::safe_cuda(hipMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
hipMemcpyDeviceToDevice));
#endif
auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
page = sample.page;
gpair = sample.gpair;
quantiser.reset(new GradientQuantiser(this->gpair));
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
dmat->Info().IsColumnSplit(), ctx_->gpu_id);
quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());
row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);
// Init histogram
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
hist.Reset();
this->InitFeatureGroupsOnce();
}
GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) {
int nidx = RegTree::kRoot;
GPUTrainingParam gpu_param(param);
auto sampled_features = column_sampler.GetFeatureSet(0);
sampled_features->SetDevice(ctx_->gpu_id);
auto sampled_features = column_sampler_->GetFeatureSet(0);
sampled_features->SetDevice(ctx_->Device());
common::Span<bst_feature_t> feature_set =
interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
@@ -324,19 +309,19 @@ struct GPUHistMakerDevice {
dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
// Store the feature set ptrs so they don't go out of scope before the kernel is called
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
for (size_t i = 0; i < candidates.size(); i++) {
for (std::size_t i = 0; i < candidates.size(); i++) {
auto candidate = candidates.at(i);
int left_nidx = tree[candidate.nid].LeftChild();
int right_nidx = tree[candidate.nid].RightChild();
nidx[i * 2] = left_nidx;
nidx[i * 2 + 1] = right_nidx;
auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
left_sampled_features->SetDevice(ctx_->gpu_id);
auto left_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(left_nidx));
left_sampled_features->SetDevice(ctx_->Device());
feature_sets.emplace_back(left_sampled_features);
common::Span<bst_feature_t> left_feature_set =
interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
right_sampled_features->SetDevice(ctx_->gpu_id);
auto right_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(right_nidx));
right_sampled_features->SetDevice(ctx_->Device());
feature_sets.emplace_back(right_sampled_features);
common::Span<bst_feature_t> right_feature_set =
interaction_constraints.Query(right_sampled_features->DeviceSpan(),
@@ -363,10 +348,8 @@ struct GPUHistMakerDevice {
h_node_inputs.size() * sizeof(EvaluateSplitInputs), hipMemcpyDefault));
#endif
this->evaluator_.EvaluateSplits(nidx, max_active_features,
dh::ToSpan(d_node_inputs), shared_inputs,
dh::ToSpan(entries));
this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
shared_inputs, dh::ToSpan(entries));
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
@@ -378,7 +361,7 @@ struct GPUHistMakerDevice {
#endif
dh::DefaultStream().Sync();
}
}
void BuildHist(int nidx) {
auto d_node_hist = hist.GetNodeHistogram(nidx);
@@ -410,31 +393,108 @@ struct GPUHistMakerDevice {
struct NodeSplitData {
RegTree::Node split_node;
FeatureType split_type;
common::CatBitField node_cats;
common::KCatBitField node_cats;
};
void UpdatePosition(const std::vector<GPUExpandEntry>& candidates, RegTree* p_tree) {
if (candidates.empty()) return;
std::vector<int> nidx(candidates.size());
std::vector<int> left_nidx(candidates.size());
std::vector<int> right_nidx(candidates.size());
/**
 * @brief Update row positions for the candidate splits when the data is split by column.
 *
 * For every (row, candidate) pair the locally visible feature value is turned into two
 * bits: a "missing" bit when the value is NaN, otherwise a "decision" bit when the row goes
 * left. The decision bits are OR-reduced and the missing bits are AND-reduced through the
 * collective, and the reduced bits are then handed to the row partitioner. A row falls back
 * to the split's default direction only when its missing bit is still set after the
 * reduction.
 *
 * @param d_matrix   Device accessor used to look up feature values.
 * @param split_data Split node, split type and categories for each candidate.
 * @param nidx       Indices of the nodes being split; forwarded to the row partitioner.
 * @param left_nidx  Left child indices; forwarded to the row partitioner.
 * @param right_nidx Right child indices; forwarded to the row partitioner.
 */
void UpdatePositionColumnSplit(EllpackDeviceAccessor d_matrix,
                               std::vector<NodeSplitData> const& split_data,
                               std::vector<bst_node_t> const& nidx,
                               std::vector<bst_node_t> const& left_nidx,
                               std::vector<bst_node_t> const& right_nidx) {
  auto const num_candidates = split_data.size();

  using BitVector = LBitField64;
  using BitType = BitVector::value_type;
  // One bit per (row, candidate) pair; the bit index is ridx * num_candidates + candidate.
  auto const size = BitVector::ComputeStorageSize(d_matrix.n_rows * num_candidates);
  dh::TemporaryArray<BitType> decision_storage(size, 0);
  dh::TemporaryArray<BitType> missing_storage(size, 0);
  BitVector decision_bits{dh::ToSpan(decision_storage)};
  BitVector missing_bits{dh::ToSpan(missing_storage)};

  // Stage the split data so that the device lambda below can read it through a span.
  dh::TemporaryArray<NodeSplitData> split_data_storage(num_candidates);
#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(),
                                num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
  dh::safe_cuda(hipMemcpyAsync(split_data_storage.data().get(), split_data.data(),
                               num_candidates * sizeof(NodeSplitData), hipMemcpyDefault));
#endif
  auto d_split_data = dh::ToSpan(split_data_storage);

  dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable {
    // The index type matches num_candidates (std::size_t) to avoid a signed/unsigned
    // comparison and a mixed-sign bit index.
    for (std::size_t i = 0; i < num_candidates; i++) {
      auto const& data = d_split_data[i];
      auto const cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
      if (isnan(cut_value)) {
        missing_bits.Set(ridx * num_candidates + i);
      } else {
        bool go_left;
        if (data.split_type == FeatureType::kCategorical) {
          go_left = common::Decision(data.node_cats.Bits(), cut_value);
        } else {
          go_left = cut_value <= data.split_node.SplitCond();
        }
        if (go_left) {
          decision_bits.Set(ridx * num_candidates + i);
        }
      }
    }
  });

  // A decision bit survives if it is set anywhere (OR); a missing bit survives only if it
  // is set everywhere (AND).
  collective::AllReduce<collective::Operation::kBitwiseOR>(
      ctx_->gpu_id, decision_storage.data().get(), decision_storage.size());
  collective::AllReduce<collective::Operation::kBitwiseAND>(
      ctx_->gpu_id, missing_storage.data().get(), missing_storage.size());
  collective::Synchronize(ctx_->gpu_id);

  row_partitioner->UpdatePositionBatch(
      nidx, left_nidx, right_nidx, split_data,
      [=] __device__(bst_uint ridx, int split_index, NodeSplitData const& data) {
        auto const index = ridx * num_candidates + split_index;
        bool go_left;
        if (missing_bits.Check(index)) {
          // Still missing after the reduction: follow the default direction.
          go_left = data.split_node.DefaultLeft();
        } else {
          go_left = decision_bits.Check(index);
        }
        return go_left;
      });
}
void UpdatePosition(std::vector<GPUExpandEntry> const& candidates, RegTree* p_tree) {
if (candidates.empty()) {
return;
}
std::vector<bst_node_t> nidx(candidates.size());
std::vector<bst_node_t> left_nidx(candidates.size());
std::vector<bst_node_t> right_nidx(candidates.size());
std::vector<NodeSplitData> split_data(candidates.size());
for (size_t i = 0; i < candidates.size(); i++) {
auto& e = candidates[i];
auto const& e = candidates[i];
RegTree::Node split_node = (*p_tree)[e.nid];
auto split_type = p_tree->NodeSplitType(e.nid);
nidx.at(i) = e.nid;
left_nidx.at(i) = split_node.LeftChild();
right_nidx.at(i) = split_node.RightChild();
split_data.at(i) = NodeSplitData{split_node, split_type, e.split.split_cats};
split_data.at(i) = NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)};
CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
}
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
if (info_.IsColumnSplit()) {
UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
return;
}
row_partitioner->UpdatePositionBatch(
nidx, left_nidx, right_nidx, split_data,
[=] __device__(bst_uint ridx, const NodeSplitData& data) {
[=] __device__(bst_uint ridx, int split_index, const NodeSplitData& data) {
// given a row index, returns the node id it belongs to
bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
// Missing value
bool go_left = true;
if (isnan(cut_value)) {
@@ -569,14 +629,14 @@ struct GPUHistMakerDevice {
}
CHECK(p_tree);
CHECK(out_preds_d.Device().IsCUDA());
CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
dh::safe_cuda(hipSetDevice(ctx_->Ordinal()));
#endif
CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
auto d_position = dh::ToSpan(positions);
CHECK_EQ(out_preds_d.Size(), d_position.size());
@@ -609,9 +669,8 @@ struct GPUHistMakerDevice {
monitor.Start("AllReduce");
auto d_node_hist = hist.GetNodeHistogram(nidx).data();
using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
collective::AllReduce<collective::Operation::kSum>(
ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
page->Cuts().TotalBins() * 2 * num_histograms);
collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
page->Cuts().TotalBins() * 2 * num_histograms);
monitor.Stop("AllReduce");
}
@@ -692,7 +751,6 @@ struct GPUHistMakerDevice {
CHECK(common::CheckNAN(candidate.split.fvalue));
std::vector<common::CatBitField::value_type> split_cats;
CHECK_GT(candidate.split.split_cats.Bits().size(), 0);
auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex);
split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0);
@@ -713,7 +771,6 @@ struct GPUHistMakerDevice {
evaluator_.ApplyTreeSplit(candidate, p_tree);
const auto& parent = tree[candidate.nid];
std::size_t max_nidx = std::max(parent.LeftChild(), parent.RightChild());
interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(),
parent.RightChild());
}
@@ -730,8 +787,7 @@ struct GPUHistMakerDevice {
dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(),
GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
using ReduceT = typename decltype(root_sum_quantised)::ValueT;
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<ReduceT *>(&root_sum_quantised), 2);
collective::GlobalSum(info_, reinterpret_cast<ReduceT*>(&root_sum_quantised), 2);
hist.AllocateHistograms({kRootNIdx});
this->BuildHist(kRootNIdx);
@@ -749,9 +805,8 @@ struct GPUHistMakerDevice {
return root_entry;
}
void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat,
ObjInfo const* task, RegTree* p_tree,
HostDeviceVector<bst_node_t>* p_out_position) {
void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat, ObjInfo const* task,
RegTree* p_tree, HostDeviceVector<bst_node_t>* p_out_position) {
auto& tree = *p_tree;
// Process maximum 32 nodes at a time
Driver<GPUExpandEntry> driver(param, 32);
@@ -776,7 +831,6 @@ struct GPUHistMakerDevice {
std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set),
[&](const auto& e) { return driver.IsChildValid(e); });
auto new_candidates =
pinned.GetSpan<GPUExpandEntry>(filtered_expand_set.size() * 2, GPUExpandEntry());
@@ -809,8 +863,7 @@ class GPUHistMaker : public TreeUpdater {
using GradientSumT = GradientPairPrecise;
public:
explicit GPUHistMaker(Context const* ctx, ObjInfo const* task)
: TreeUpdater(ctx), task_{task} {};
explicit GPUHistMaker(Context const* ctx, ObjInfo const* task) : TreeUpdater(ctx), task_{task} {};
void Configure(const Args& args) override {
// Used in test to count how many configurations are performed
LOG(DEBUG) << "[GPU Hist]: Configure";
@@ -823,32 +876,31 @@ class GPUHistMaker : public TreeUpdater {
void LoadConfig(Json const& in) override {
auto const& config = get<Object const>(in);
FromJson(config.at("gpu_hist_train_param"), &this->hist_maker_param_);
FromJson(config.at("hist_train_param"), &this->hist_maker_param_);
initialised_ = false;
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["gpu_hist_train_param"] = ToJson(hist_maker_param_);
out["hist_train_param"] = ToJson(hist_maker_param_);
}
// Destructor: calls Log() on the global device memory logger when the updater is destroyed.
~GPUHistMaker() { // NOLINT
dh::GlobalMemoryLogger().Log();
}
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& trees) override {
monitor_.Start("Update");
CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented();
auto gpair_hdv = gpair->Data();
// build tree
try {
size_t t_idx{0};
std::size_t t_idx{0};
for (xgboost::RegTree* tree : trees) {
this->UpdateTree(param, gpair, dmat, tree, &out_position[t_idx]);
if (hist_maker_param_.debug_synchronize) {
this->CheckTreesSynchronized(tree);
}
this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]);
this->hist_maker_param_.CheckTreesSynchronized(tree);
++t_idx;
}
@@ -870,9 +922,9 @@ class GPUHistMaker : public TreeUpdater {
// Synchronise the column sampling seed
uint32_t column_sampling_seed = common::GlobalRandom()();
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
auto page = (*dmat->GetBatches<EllpackPage>(ctx_, batch_param).begin()).Impl();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
@@ -880,9 +932,9 @@ class GPUHistMaker : public TreeUpdater {
#endif
info_->feature_types.SetDevice(ctx_->gpu_id);
maker.reset(new GPUHistMakerDevice<GradientSumT>(
ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
column_sampling_seed, info_->num_col_, batch_param));
maker = std::make_unique<GPUHistMakerDevice>(
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
*param, column_sampler_, info_->num_col_, batch_param, dmat->Info());
p_last_fmat_ = dmat;
initialised_ = true;
@@ -895,21 +947,7 @@ class GPUHistMaker : public TreeUpdater {
monitor_.Stop("InitDataOnce");
}
p_last_tree_ = p_tree;
}
// Only call this method for testing
void CheckTreesSynchronized(RegTree* local_tree) const {
std::string s_model;
common::MemoryBufferStream fs(&s_model);
int rank = collective::GetRank();
if (rank == 0) {
local_tree->Save(&fs);
}
fs.Seek(0);
collective::Broadcast(&s_model, 0);
RegTree reference_tree{}; // rank 0 tree
reference_tree.Load(&fs);
CHECK(*local_tree == reference_tree);
CHECK(hist_maker_param_.GetInitialised());
}
void UpdateTree(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
@@ -935,7 +973,7 @@ class GPUHistMaker : public TreeUpdater {
MetaInfo* info_{}; // NOLINT
std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker; // NOLINT
std::unique_ptr<GPUHistMakerDevice> maker; // NOLINT
// Returns the fixed name of this updater, "grow_gpu_hist".
[[nodiscard]] char const* Name() const override { return "grow_gpu_hist"; }
// Always reports true for this updater.
[[nodiscard]] bool HasNodePosition() const override { return true; }
@@ -943,13 +981,14 @@ class GPUHistMaker : public TreeUpdater {
private:
bool initialised_{false};
GPUHistMakerTrainParam hist_maker_param_;
HistMakerTrainParam hist_maker_param_;
DMatrix* p_last_fmat_{nullptr};
RegTree const* p_last_tree_{nullptr};
ObjInfo const* task_{nullptr};
common::Monitor monitor_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
};
#if !defined(GTEST_TEST)
@@ -959,4 +998,135 @@ XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
return new GPUHistMaker(ctx, task);
});
#endif // !defined(GTEST_TEST)
/**
 * @brief Tree updater exposed under the name "grow_gpu_approx".
 *
 * A new GPUHistMakerDevice is constructed on every call to Update() from a BatchParam that
 * carries the hessian values extracted from the current gradients. The column sampler is
 * created by InitDataOnce() on first use after Configure() or LoadConfig().
 */
class GPUGlobalApproxMaker : public TreeUpdater {
 public:
  explicit GPUGlobalApproxMaker(Context const* ctx, ObjInfo const* task)
      : TreeUpdater(ctx), task_{task} {}

  /**
   * @brief Read the hist parameters from `args` and mark the updater as uninitialised.
   *
   * Unknown arguments are tolerated. A warning is logged when `max_cached_hist_node` differs
   * from its default because the value is ignored here.
   */
  void Configure(Args const& args) override {
    // Used in test to count how many configurations are performed
    LOG(DEBUG) << "[GPU Approx]: Configure";
    hist_maker_param_.UpdateAllowUnknown(args);
    if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) {
      LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU.";
    }
    dh::CheckComputeCapability();
    initialised_ = false;

    monitor_.Init(this->Name());
  }

  /** @brief Restore the hist parameters from the "hist_train_param" entry of `in`. */
  void LoadConfig(Json const& in) override {
    auto const& config = get<Object const>(in);
    FromJson(config.at("hist_train_param"), &this->hist_maker_param_);
    initialised_ = false;
  }

  /** @brief Store the hist parameters under the "hist_train_param" key of `*p_out`. */
  void SaveConfig(Json* p_out) const override {
    auto& out = *p_out;
    out["hist_train_param"] = ToJson(hist_maker_param_);
  }

  ~GPUGlobalApproxMaker() override { dh::GlobalMemoryLogger().Log(); }

  /**
   * @brief Grow every tree in `trees` from the given gradients.
   *
   * @param param        Training parameters; `max_bin` is used for the batch parameter.
   * @param gpair        Gradient matrix; moved to the context device before use.
   * @param p_fmat       Training data.
   * @param out_position One output position vector per tree, indexed like `trees`.
   * @param trees        Trees to grow.
   */
  void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat,
              common::Span<HostDeviceVector<bst_node_t>> out_position,
              const std::vector<RegTree*>& trees) override {
    monitor_.Start("Update");

    this->InitDataOnce(p_fmat);
    // build tree
    hess_.resize(gpair->Size());
    auto hess = dh::ToSpan(hess_);

    gpair->SetDevice(ctx_->Device());
    auto d_gpair = gpair->Data()->ConstDeviceSpan();
    auto cuctx = ctx_->CUDACtx();
    // Extract the hessian of every gradient pair into hess_.
    thrust::transform(cuctx->CTP(), dh::tcbegin(d_gpair), dh::tcend(d_gpair), dh::tbegin(hess),
                      [=] XGBOOST_DEVICE(GradientPair const& g) { return g.GetHess(); });

    auto const& info = p_fmat->Info();
    info.feature_types.SetDevice(ctx_->Device());
    // NOTE(review): presumably the hessian span is used as sketching weight unless the
    // objective has a constant hessian - confirm against BatchParam.
    auto batch = BatchParam{param->max_bin, hess, !task_->const_hess};
    maker_ = std::make_unique<GPUHistMakerDevice>(
        ctx_, !p_fmat->SingleColBlock(), info.feature_types.ConstDeviceSpan(), info.num_row_,
        *param, column_sampler_, info.num_col_, batch, p_fmat->Info());

    std::size_t t_idx{0};
    for (xgboost::RegTree* tree : trees) {
      this->UpdateTree(gpair->Data(), p_fmat, tree, &out_position[t_idx]);
      this->hist_maker_param_.CheckTreesSynchronized(tree);
      ++t_idx;
    }

    monitor_.Stop("Update");
  }

  /**
   * @brief One-time setup: broadcast a column sampling seed and create the column sampler.
   *
   * Does nothing when the updater is already initialised.
   */
  void InitDataOnce(DMatrix* p_fmat) {
    if (this->initialised_) {
      return;
    }

    monitor_.Start(__func__);
    CHECK(ctx_->IsCUDA()) << error::InvalidCUDAOrdinal();
    // Synchronise the column sampling seed
    uint32_t column_sampling_seed = common::GlobalRandom()();
    collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
    this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);

    p_last_fmat_ = p_fmat;
    initialised_ = true;
    monitor_.Stop(__func__);
  }

  /** @brief Run the one-time setup if needed and remember `p_tree` as the last tree. */
  void InitData(DMatrix* p_fmat, RegTree const* p_tree) {
    this->InitDataOnce(p_fmat);
    p_last_tree_ = p_tree;
    CHECK(hist_maker_param_.GetInitialised());
  }

  /** @brief Grow a single tree by delegating to the device builder created in Update(). */
  void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
                  HostDeviceVector<bst_node_t>* p_out_position) {
    monitor_.Start("InitData");
    this->InitData(p_fmat, p_tree);
    monitor_.Stop("InitData");

    // Select the device the same way as Update() does instead of the raw gpu_id ordinal.
    gpair->SetDevice(ctx_->Device());
    maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
  }

  /**
   * @brief Update the prediction cache from the last grown tree.
   *
   * @return false when no device builder exists or `data` is not the matrix seen by the
   *         last initialisation; otherwise the result reported by the device builder.
   */
  bool UpdatePredictionCache(const DMatrix* data,
                             linalg::MatrixView<bst_float> p_out_preds) override {
    if (maker_ == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
      return false;
    }
    monitor_.Start("UpdatePredictionCache");
    bool result = maker_->UpdatePredictionCache(p_out_preds, p_last_tree_);
    monitor_.Stop("UpdatePredictionCache");
    return result;
  }

  [[nodiscard]] char const* Name() const override { return "grow_gpu_approx"; }
  [[nodiscard]] bool HasNodePosition() const override { return true; }

 private:
  // Cleared by Configure()/LoadConfig(); set once InitDataOnce() has run.
  bool initialised_{false};

  HistMakerTrainParam hist_maker_param_;
  // Hessian of every gradient pair, refreshed at the start of each Update().
  dh::device_vector<float> hess_;
  std::shared_ptr<common::ColumnSampler> column_sampler_;
  std::unique_ptr<GPUHistMakerDevice> maker_;

  DMatrix* p_last_fmat_{nullptr};
  RegTree const* p_last_tree_{nullptr};
  ObjInfo const* task_{nullptr};

  common::Monitor monitor_;
};
#if !defined(GTEST_TEST)
// Registers the "grow_gpu_approx" updater through XGBOOST_REGISTER_TREE_UPDATER; the factory
// body returns a newly allocated GPUGlobalApproxMaker. Compiled out when GTEST_TEST is defined.
// NOTE(review): the first macro argument (GPUApproxMaker) differs from the class name;
// presumably it only serves as a unique registration identifier - confirm.
XGBOOST_REGISTER_TREE_UPDATER(GPUApproxMaker, "grow_gpu_approx")
.describe("Grow tree with GPU.")
.set_body([](Context const* ctx, ObjInfo const* task) {
return new GPUGlobalApproxMaker(ctx, task);
});
#endif // !defined(GTEST_TEST)
} // namespace xgboost::tree

View File

@@ -31,7 +31,7 @@ class TreePruner : public TreeUpdater {
[[nodiscard]] bool CanModifyTree() const override { return true; }
// update the tree, do pruning
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& trees) override {
pruner_monitor_.Start("PrunerUpdate");

View File

@@ -4,39 +4,40 @@
* \brief use quantized feature values to construct a tree
* \author Philip Cho, Tianqi Checn, Egor Smirnov
*/
#include <algorithm> // for max, copy, transform
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t, int32_t
#include <memory> // for unique_ptr, allocator, make_unique, shared_ptr
#include <numeric> // for accumulate
#include <ostream> // for basic_ostream, char_traits, operator<<
#include <utility> // for move, swap
#include <vector> // for vector
#include <algorithm> // for max, copy, transform
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t, int32_t
#include <exception> // for exception
#include <memory> // for allocator, unique_ptr, make_unique, shared_ptr
#include <ostream> // for operator<<, basic_ostream, char_traits
#include <utility> // for move
#include <vector> // for vector
#include "../collective/aggregator.h" // for GlobalSum
#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
#include "../collective/communicator.h" // for Operation
#include "../common/hist_util.h" // for HistogramCuts, HistCollection
#include "../collective/communicator-inl.h" // for IsDistributed
#include "../common/hist_util.h" // for HistogramCuts, GHistRow
#include "../common/linalg_op.h" // for begin, cbegin, cend
#include "../common/random.h" // for ColumnSampler
#include "../common/threading_utils.h" // for ParallelFor
#include "../common/timer.h" // for Monitor
#include "../common/transform_iterator.h" // for IndexTransformIter, MakeIndexTransformIter
#include "../common/transform_iterator.h" // for IndexTransformIter
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "common_row_partitioner.h" // for CommonRowPartitioner
#include "dmlc/omp.h" // for omp_get_thread_num
#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG
#include "driver.h" // for Driver
#include "hist/evaluate_splits.h" // for HistEvaluator, HistMultiEvaluator, UpdatePre...
#include "hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
#include "hist/histogram.h" // for HistogramBuilder, ConstructHistSpace
#include "hist/hist_cache.h" // for BoundedHistCollection
#include "hist/histogram.h" // for MultiHistogramBuilder
#include "hist/param.h" // for HistMakerTrainParam
#include "hist/sampler.h" // for SampleGradient
#include "param.h" // for TrainParam, SplitEntryContainer, GradStats
#include "xgboost/base.h" // for GradientPairInternal, GradientPair, bst_targ...
#include "param.h" // for TrainParam, GradStats
#include "xgboost/base.h" // for Args, GradientPairPrecise, GradientPair, Gra...
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for BatchIterator, BatchSet, DMatrix, MetaInfo
#include "xgboost/data.h" // for BatchSet, DMatrix, BatchIterator, MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for All, MatrixView, TensorView, Matrix, Empty
#include "xgboost/json.h" // for Object, Json, FromJson, ToJson, get
#include "xgboost/linalg.h" // for MatrixView, TensorView, All, Matrix, Empty
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_GE
#include "xgboost/span.h" // for Span, operator!=, SpanIterator
#include "xgboost/string_view.h" // for operator<<
@@ -117,10 +118,11 @@ class MultiTargetHistBuilder {
private:
common::Monitor *monitor_{nullptr};
TrainParam const *param_{nullptr};
HistMakerTrainParam const *hist_param_{nullptr};
std::shared_ptr<common::ColumnSampler> col_sampler_;
std::unique_ptr<HistMultiEvaluator> evaluator_;
// Histogram builder for each target.
std::vector<HistogramBuilder<MultiExpandEntry>> histogram_builder_;
std::unique_ptr<MultiHistogramBuilder> histogram_builder_;
Context const *ctx_{nullptr};
// Partitioner for each data batch.
std::vector<CommonRowPartitioner> partitioner_;
@@ -150,7 +152,6 @@ class MultiTargetHistBuilder {
monitor_->Start(__func__);
p_last_fmat_ = p_fmat;
std::size_t page_id = 0;
bst_bin_t n_total_bins = 0;
partitioner_.clear();
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
@@ -160,16 +161,13 @@ class MultiTargetHistBuilder {
CHECK_EQ(n_total_bins, page.cut.TotalBins());
}
partitioner_.emplace_back(ctx_, page.Size(), page.base_rowid, p_fmat->Info().IsColumnSplit());
page_id++;
}
bst_target_t n_targets = p_tree->NumTargets();
histogram_builder_.clear();
for (std::size_t i = 0; i < n_targets; ++i) {
histogram_builder_.emplace_back();
histogram_builder_.back().Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed(), p_fmat->Info().IsColumnSplit());
}
histogram_builder_ = std::make_unique<MultiHistogramBuilder>();
histogram_builder_->Reset(ctx_, n_total_bins, n_targets, HistBatch(param_),
collective::IsDistributed(), p_fmat->Info().IsColumnSplit(),
hist_param_);
evaluator_ = std::make_unique<HistMultiEvaluator>(ctx_, p_fmat->Info(), param_, col_sampler_);
p_last_tree_ = p_tree;
@@ -204,17 +202,7 @@ class MultiTargetHistBuilder {
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
root_sum.Size() * 2);
std::vector<MultiExpandEntry> nodes{best};
std::size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
for (bst_target_t t{0}; t < n_targets; ++t) {
auto t_gpair = gpair.Slice(linalg::All(), t);
histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes, {}, t_gpair.Values());
}
i++;
}
histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, best, HistBatch(param_));
auto weight = evaluator_->InitRoot(root_sum);
auto weight_t = weight.HostView();
@@ -222,9 +210,10 @@ class MultiTargetHistBuilder {
[&](float w) { return w * param_->learning_rate; });
p_tree->SetLeaf(RegTree::kRoot, weight_t);
std::vector<common::HistCollection const *> hists;
std::vector<BoundedHistCollection const *> hists;
std::vector<MultiExpandEntry> nodes{{RegTree::kRoot, 0}};
for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
hists.push_back(&histogram_builder_[t].Histogram());
hists.push_back(&(*histogram_builder_).Histogram(t));
}
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes);
@@ -239,50 +228,17 @@ class MultiTargetHistBuilder {
std::vector<MultiExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
monitor_->Start(__func__);
std::vector<MultiExpandEntry> nodes_to_build;
std::vector<MultiExpandEntry> nodes_to_sub;
for (auto const &c : valid_candidates) {
auto left_nidx = p_tree->LeftChild(c.nid);
auto right_nidx = p_tree->RightChild(c.nid);
auto build_nidx = left_nidx;
auto subtract_nidx = right_nidx;
auto lit =
common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
auto rit =
common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
auto fewer_right = right_sum < left_sum;
if (fewer_right) {
std::swap(build_nidx, subtract_nidx);
}
nodes_to_build.emplace_back(build_nidx, p_tree->GetDepth(build_nidx));
nodes_to_sub.emplace_back(subtract_nidx, p_tree->GetDepth(subtract_nidx));
}
std::size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) {
auto t_gpair = gpair.Slice(linalg::All(), t);
// Make sure the gradient matrix is f-order.
CHECK(t_gpair.Contiguous());
histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes_to_build, nodes_to_sub, t_gpair.Values());
}
i++;
}
histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, gpair,
HistBatch(param_));
monitor_->Stop(__func__);
}
void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<MultiExpandEntry> *best_splits) {
monitor_->Start(__func__);
std::vector<common::HistCollection const *> hists;
std::vector<BoundedHistCollection const *> hists;
for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
hists.push_back(&histogram_builder_[t].Histogram());
hists.push_back(&(*histogram_builder_).Histogram(t));
}
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits);
@@ -306,10 +262,12 @@ class MultiTargetHistBuilder {
public:
explicit MultiTargetHistBuilder(Context const *ctx, MetaInfo const &info, TrainParam const *param,
HistMakerTrainParam const *hist_param,
std::shared_ptr<common::ColumnSampler> column_sampler,
ObjInfo const *task, common::Monitor *monitor)
: monitor_{monitor},
param_{param},
hist_param_{hist_param},
col_sampler_{std::move(column_sampler)},
evaluator_{std::make_unique<HistMultiEvaluator>(ctx, info, param, col_sampler_)},
ctx_{ctx},
@@ -331,10 +289,14 @@ class MultiTargetHistBuilder {
}
};
class HistBuilder {
/**
* @brief Tree updater for single-target trees.
*/
class HistUpdater {
private:
common::Monitor *monitor_;
TrainParam const *param_;
HistMakerTrainParam const *hist_param_{nullptr};
std::shared_ptr<common::ColumnSampler> col_sampler_;
std::unique_ptr<HistEvaluator> evaluator_;
std::vector<CommonRowPartitioner> partitioner_;
@@ -343,22 +305,22 @@ class HistBuilder {
const RegTree *p_last_tree_{nullptr};
DMatrix const *const p_last_fmat_{nullptr};
std::unique_ptr<HistogramBuilder<CPUExpandEntry>> histogram_builder_;
std::unique_ptr<MultiHistogramBuilder> histogram_builder_;
ObjInfo const *task_{nullptr};
// Context for number of threads
Context const *ctx_{nullptr};
public:
explicit HistBuilder(Context const *ctx, std::shared_ptr<common::ColumnSampler> column_sampler,
TrainParam const *param, DMatrix const *fmat, ObjInfo const *task,
common::Monitor *monitor)
explicit HistUpdater(Context const *ctx, std::shared_ptr<common::ColumnSampler> column_sampler,
TrainParam const *param, HistMakerTrainParam const *hist_param,
DMatrix const *fmat, ObjInfo const *task, common::Monitor *monitor)
: monitor_{monitor},
param_{param},
hist_param_{hist_param},
col_sampler_{std::move(column_sampler)},
evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(),
col_sampler_)},
evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(), col_sampler_)},
p_last_fmat_(fmat),
histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
histogram_builder_{new MultiHistogramBuilder},
task_{task},
ctx_{ctx} {
monitor_->Init(__func__);
@@ -381,7 +343,6 @@ class HistBuilder {
// initialize temp data structure
void InitData(DMatrix *fmat, RegTree const *p_tree) {
monitor_->Start(__func__);
std::size_t page_id{0};
bst_bin_t n_total_bins{0};
partitioner_.clear();
for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
@@ -392,10 +353,9 @@ class HistBuilder {
}
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid,
fmat->Info().IsColumnSplit());
++page_id;
}
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed(), fmat->Info().IsColumnSplit());
histogram_builder_->Reset(ctx_, n_total_bins, 1, HistBatch(param_), collective::IsDistributed(),
fmat->Info().IsColumnSplit(), hist_param_);
evaluator_ = std::make_unique<HistEvaluator>(ctx_, this->param_, fmat->Info(), col_sampler_);
p_last_tree_ = p_tree;
monitor_->Stop(__func__);
@@ -404,7 +364,7 @@ class HistBuilder {
void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
std::vector<CPUExpandEntry> *best_splits) {
monitor_->Start(__func__);
auto const &histograms = histogram_builder_->Histogram();
auto const &histograms = histogram_builder_->Histogram(0);
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, best_splits);
@@ -422,16 +382,8 @@ class HistBuilder {
monitor_->Start(__func__);
CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0));
std::size_t page_id = 0;
auto space = ConstructHistSpace(partitioner_, {node});
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
std::vector<CPUExpandEntry> nodes_to_build{node};
std::vector<CPUExpandEntry> nodes_to_sub;
this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
partitioner_.at(page_id).Partitions(), nodes_to_build,
nodes_to_sub, gpair.Slice(linalg::All(), 0).Values());
++page_id;
}
this->histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, node,
HistBatch(param_));
{
GradientPairPrecise grad_stat;
@@ -445,7 +397,7 @@ class HistBuilder {
CHECK_GE(row_ptr.size(), 2);
std::uint32_t const ibegin = row_ptr[0];
std::uint32_t const iend = row_ptr[1];
auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot];
auto hist = this->histogram_builder_->Histogram(0)[RegTree::kRoot];
auto begin = hist.data();
for (std::uint32_t i = ibegin; i < iend; ++i) {
GradientPairPrecise const &et = begin[i];
@@ -468,7 +420,7 @@ class HistBuilder {
monitor_->Start("EvaluateSplits");
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree,
evaluator_->EvaluateSplits(histogram_builder_->Histogram(0), gmat.cut, ft, *p_tree,
&entries);
break;
}
@@ -484,33 +436,8 @@ class HistBuilder {
std::vector<CPUExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
monitor_->Start(__func__);
std::vector<CPUExpandEntry> nodes_to_build(valid_candidates.size());
std::vector<CPUExpandEntry> nodes_to_sub(valid_candidates.size());
std::size_t n_idx = 0;
for (auto const &c : valid_candidates) {
auto left_nidx = (*p_tree)[c.nid].LeftChild();
auto right_nidx = (*p_tree)[c.nid].RightChild();
auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
auto build_nidx = left_nidx;
auto subtract_nidx = right_nidx;
if (fewer_right) {
std::swap(build_nidx, subtract_nidx);
}
nodes_to_build[n_idx] = CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}};
nodes_to_sub[n_idx] = CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}};
n_idx++;
}
std::size_t page_id{0};
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
partitioner_.at(page_id).Partitions(), nodes_to_build,
nodes_to_sub, gpair.Values());
++page_id;
}
this->histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates,
gpair, HistBatch(param_));
monitor_->Stop(__func__);
}
@@ -529,7 +456,7 @@ class HistBuilder {
std::vector<bst_node_t> *p_out_position) {
monitor_->Start(__func__);
if (!task_->UpdateTreeLeaf()) {
monitor_->Stop(__func__);
monitor_->Stop(__func__);
return;
}
for (auto const &part : partitioner_) {
@@ -541,42 +468,50 @@ class HistBuilder {
/*! \brief construct a tree using quantized feature values */
class QuantileHistMaker : public TreeUpdater {
std::unique_ptr<HistBuilder> p_impl_{nullptr};
std::unique_ptr<HistUpdater> p_impl_{nullptr};
std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
std::shared_ptr<common::ColumnSampler> column_sampler_ =
std::make_shared<common::ColumnSampler>();
common::Monitor monitor_;
ObjInfo const *task_{nullptr};
HistMakerTrainParam hist_param_;
public:
explicit QuantileHistMaker(Context const *ctx, ObjInfo const *task)
: TreeUpdater{ctx}, task_{task} {}
void Configure(const Args &) override {}
void LoadConfig(Json const &) override {}
void SaveConfig(Json *) const override {}
void Configure(Args const &args) override { hist_param_.UpdateAllowUnknown(args); }
void LoadConfig(Json const &in) override {
auto const &config = get<Object const>(in);
FromJson(config.at("hist_train_param"), &hist_param_);
}
void SaveConfig(Json *p_out) const override {
auto &out = *p_out;
out["hist_train_param"] = ToJson(hist_param_);
}
[[nodiscard]] char const *Name() const override { return "grow_quantile_histmaker"; }
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *p_fmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) override {
if (trees.front()->IsMultiTarget()) {
CHECK(hist_param_.GetInitialised());
CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented();
if (!p_mtimpl_) {
this->p_mtimpl_ = std::make_unique<MultiTargetHistBuilder>(
ctx_, p_fmat->Info(), param, column_sampler_, task_, &monitor_);
ctx_, p_fmat->Info(), param, &hist_param_, column_sampler_, task_, &monitor_);
}
} else {
CHECK(hist_param_.GetInitialised());
if (!p_impl_) {
p_impl_ =
std::make_unique<HistBuilder>(ctx_, column_sampler_, param, p_fmat, task_, &monitor_);
p_impl_ = std::make_unique<HistUpdater>(ctx_, column_sampler_, param, &hist_param_, p_fmat,
task_, &monitor_);
}
}
bst_target_t n_targets = trees.front()->NumTargets();
auto h_gpair =
linalg::MakeTensorView(ctx_, gpair->HostSpan(), p_fmat->Info().num_row_, n_targets);
auto h_gpair = gpair->HostView();
linalg::Matrix<GradientPair> sample_out;
auto h_sample_out = h_gpair;
@@ -601,6 +536,8 @@ class QuantileHistMaker : public TreeUpdater {
UpdateTree<CPUExpandEntry>(&monitor_, h_sample_out, p_impl_.get(), p_fmat, param,
h_out_position, *tree_it);
}
hist_param_.CheckTreesSynchronized(*tree_it);
}
}

View File

@@ -31,11 +31,14 @@ class TreeRefresher : public TreeUpdater {
[[nodiscard]] char const *Name() const override { return "refresh"; }
[[nodiscard]] bool CanModifyTree() const override { return true; }
// update the tree, do pruning
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *p_fmat,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree *> &trees) override {
if (trees.size() == 0) return;
const std::vector<GradientPair> &gpair_h = gpair->ConstHostVector();
if (trees.size() == 0) {
return;
}
CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented();
const std::vector<GradientPair> &gpair_h = gpair->Data()->ConstHostVector();
// thread temporal space
std::vector<std::vector<GradStats> > stemp;
std::vector<RegTree::FVec> fvec_temp;

View File

@@ -31,7 +31,7 @@ class TreeSyncher : public TreeUpdater {
[[nodiscard]] char const* Name() const override { return "prune"; }
void Update(TrainParam const*, HostDeviceVector<GradientPair>*, DMatrix*,
void Update(TrainParam const*, linalg::Matrix<GradientPair>*, DMatrix*,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree*>& trees) override {
if (collective::GetWorldSize() == 1) return;