temp merge, disable 1 line, SetValid
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2019 XGBoost contributors
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost contributors
|
||||
*/
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/device_vector.h>
|
||||
@@ -140,20 +140,20 @@ void FeatureInteractionConstraintDevice::Reset() {
|
||||
__global__ void ClearBuffersKernel(
|
||||
LBitField64 result_buffer_output, LBitField64 result_buffer_input) {
|
||||
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < result_buffer_output.Size()) {
|
||||
if (tid < result_buffer_output.Capacity()) {
|
||||
result_buffer_output.Clear(tid);
|
||||
}
|
||||
if (tid < result_buffer_input.Size()) {
|
||||
if (tid < result_buffer_input.Capacity()) {
|
||||
result_buffer_input.Clear(tid);
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureInteractionConstraintDevice::ClearBuffers() {
|
||||
CHECK_EQ(output_buffer_bits_.Size(), input_buffer_bits_.Size());
|
||||
CHECK_LE(feature_buffer_.Size(), output_buffer_bits_.Size());
|
||||
CHECK_EQ(output_buffer_bits_.Capacity(), input_buffer_bits_.Capacity());
|
||||
CHECK_LE(feature_buffer_.Capacity(), output_buffer_bits_.Capacity());
|
||||
uint32_t constexpr kBlockThreads = 256;
|
||||
auto const n_grids = static_cast<uint32_t>(
|
||||
common::DivRoundUp(input_buffer_bits_.Size(), kBlockThreads));
|
||||
common::DivRoundUp(input_buffer_bits_.Capacity(), kBlockThreads));
|
||||
dh::LaunchKernel {n_grids, kBlockThreads} (
|
||||
ClearBuffersKernel,
|
||||
output_buffer_bits_, input_buffer_bits_);
|
||||
@@ -207,11 +207,11 @@ common::Span<bst_feature_t> FeatureInteractionConstraintDevice::Query(
|
||||
ClearBuffers();
|
||||
|
||||
LBitField64 node_constraints = s_node_constraints_[nid];
|
||||
CHECK_EQ(input_buffer_bits_.Size(), output_buffer_bits_.Size());
|
||||
CHECK_EQ(input_buffer_bits_.Capacity(), output_buffer_bits_.Capacity());
|
||||
|
||||
uint32_t constexpr kBlockThreads = 256;
|
||||
auto n_grids = static_cast<uint32_t>(
|
||||
common::DivRoundUp(output_buffer_bits_.Size(), kBlockThreads));
|
||||
common::DivRoundUp(output_buffer_bits_.Capacity(), kBlockThreads));
|
||||
dh::LaunchKernel {n_grids, kBlockThreads} (
|
||||
SetInputBufferKernel,
|
||||
feature_list, input_buffer_bits_);
|
||||
@@ -274,13 +274,13 @@ __global__ void InteractionConstraintSplitKernel(LBitField64 feature,
|
||||
LBitField64 left,
|
||||
LBitField64 right) {
|
||||
auto tid = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if (tid > node.Size()) {
|
||||
if (tid > node.Capacity()) {
|
||||
return;
|
||||
}
|
||||
// enable constraints from feature
|
||||
node |= feature;
|
||||
// clear the buffer after use
|
||||
if (tid < feature.Size()) {
|
||||
if (tid < feature.Capacity()) {
|
||||
feature.Clear(tid);
|
||||
}
|
||||
|
||||
@@ -323,7 +323,7 @@ void FeatureInteractionConstraintDevice::Split(
|
||||
s_sets_, s_sets_ptr_);
|
||||
|
||||
uint32_t constexpr kBlockThreads = 256;
|
||||
auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Size(), kBlockThreads));
|
||||
auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Capacity(), kBlockThreads));
|
||||
|
||||
dh::LaunchKernel {n_grids, kBlockThreads} (
|
||||
InteractionConstraintSplitKernel,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2018-2019 by Contributors
|
||||
/**
|
||||
* Copyright 2018-2023 by Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_TREE_CONSTRAINTS_H_
|
||||
#define XGBOOST_TREE_CONSTRAINTS_H_
|
||||
@@ -8,10 +8,8 @@
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include "xgboost/span.h"
|
||||
#include "xgboost/base.h"
|
||||
|
||||
#include "param.h"
|
||||
#include "xgboost/base.h"
|
||||
|
||||
namespace xgboost {
|
||||
/*!
|
||||
|
||||
@@ -55,27 +55,26 @@ void FitStump(Context const* ctx, MetaInfo const& info,
|
||||
} // namespace cpu_impl
|
||||
|
||||
namespace cuda_impl {
|
||||
void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
|
||||
linalg::VectorView<float> out);
|
||||
void FitStump(Context const* ctx, MetaInfo const& info,
|
||||
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out);
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
inline void FitStump(Context const*, linalg::TensorView<GradientPair const, 2>,
|
||||
inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<GradientPair const, 2>,
|
||||
linalg::VectorView<float>) {
|
||||
common::AssertGPUSupport();
|
||||
}
|
||||
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
|
||||
} // namespace cuda_impl
|
||||
|
||||
void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
|
||||
void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
|
||||
bst_target_t n_targets, linalg::Vector<float>* out) {
|
||||
out->SetDevice(ctx->gpu_id);
|
||||
out->Reshape(n_targets);
|
||||
auto n_samples = gpair.Size() / n_targets;
|
||||
|
||||
gpair.SetDevice(ctx->gpu_id);
|
||||
auto gpair_t = linalg::MakeTensorView(ctx, &gpair, n_samples, n_targets);
|
||||
gpair.SetDevice(ctx->Device());
|
||||
auto gpair_t = gpair.View(ctx->Device());
|
||||
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
|
||||
: cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id));
|
||||
: cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
|
||||
}
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2022 by XGBoost Contributors
|
||||
* Copyright 2022-2023 by XGBoost Contributors
|
||||
*
|
||||
* \brief Utilities for estimating initial score.
|
||||
*/
|
||||
@@ -11,6 +11,7 @@
|
||||
|
||||
#include <cstddef> // std::size_t
|
||||
|
||||
#include "../collective/aggregator.cuh"
|
||||
#include "../collective/communicator-inl.cuh"
|
||||
#include "../common/device_helpers.cuh" // dh::MakeTransformIterator
|
||||
#include "fit_stump.h"
|
||||
@@ -23,8 +24,8 @@
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace cuda_impl {
|
||||
void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
|
||||
linalg::VectorView<float> out) {
|
||||
void FitStump(Context const* ctx, MetaInfo const& info,
|
||||
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
|
||||
auto n_targets = out.Size();
|
||||
CHECK_EQ(n_targets, gpair.Shape(1));
|
||||
linalg::Vector<GradientPairPrecise> sum = linalg::Constant(ctx, GradientPairPrecise{}, n_targets);
|
||||
@@ -41,7 +42,7 @@ void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpai
|
||||
auto sample = i % gpair.Shape(0);
|
||||
return GradientPairPrecise{gpair(sample, target)};
|
||||
});
|
||||
auto d_sum = sum.View(ctx->gpu_id);
|
||||
auto d_sum = sum.View(ctx->Device());
|
||||
CHECK(d_sum.CContiguous());
|
||||
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
@@ -55,8 +56,8 @@ void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpai
|
||||
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
|
||||
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
|
||||
|
||||
collective::AllReduce<collective::Operation::kSum>(
|
||||
ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()), d_sum.Size() * 2);
|
||||
collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()),
|
||||
d_sum.Size() * 2);
|
||||
|
||||
thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
|
||||
[=] XGBOOST_DEVICE(std::size_t i) mutable {
|
||||
|
||||
@@ -31,7 +31,7 @@ XGBOOST_DEVICE inline double CalcUnregularizedWeight(T sum_grad, T sum_hess) {
|
||||
/**
|
||||
* @brief Fit a tree stump as an estimation of base_score.
|
||||
*/
|
||||
void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
|
||||
void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
|
||||
bst_target_t n_targets, linalg::Vector<float>* out);
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
/*!
|
||||
* Copyright 2020-2022 by XGBoost Contributors
|
||||
/**
|
||||
* Copyright 2020-2023, XGBoost Contributors
|
||||
*/
|
||||
#include <algorithm> // std::max
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
|
||||
#include "../../collective/communicator-inl.cuh"
|
||||
#include "../../common/categorical.h"
|
||||
#include "../../common/device_helpers.cuh"
|
||||
#include "../../data/ellpack_page.cuh"
|
||||
#include "evaluate_splits.cuh"
|
||||
#include "expand_entry.cuh"
|
||||
@@ -24,13 +24,11 @@
|
||||
#define WARP_SIZE 32
|
||||
#endif
|
||||
|
||||
namespace xgboost {
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
namespace cub = hipcub;
|
||||
#endif
|
||||
|
||||
namespace tree {
|
||||
|
||||
namespace xgboost::tree {
|
||||
// With constraints
|
||||
XGBOOST_DEVICE float LossChangeMissing(const GradientPairInt64 &scan,
|
||||
const GradientPairInt64 &missing,
|
||||
@@ -352,11 +350,11 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
|
||||
common::Span<common::CatBitField::value_type> out,
|
||||
DeviceSplitCandidate *p_out_split) {
|
||||
auto &out_split = *p_out_split;
|
||||
out_split.split_cats = common::CatBitField{out};
|
||||
auto out_cats = common::CatBitField{out};
|
||||
|
||||
// Simple case for one hot split
|
||||
if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) {
|
||||
out_split.split_cats.Set(common::AsCat(out_split.thresh));
|
||||
out_cats.Set(common::AsCat(out_split.thresh));
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -376,7 +374,7 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
|
||||
assert(partition > 0 && "Invalid partition.");
|
||||
thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) {
|
||||
auto cat = shared_inputs.feature_values[c - node_offset];
|
||||
out_split.SetCat(cat);
|
||||
out_cats.Set(common::AsCat(cat));
|
||||
});
|
||||
}
|
||||
|
||||
@@ -453,6 +451,24 @@ void GPUHistEvaluator::EvaluateSplits(
|
||||
this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
|
||||
evaluator, out_splits);
|
||||
|
||||
if (is_column_split_) {
|
||||
// With column-wise data split, we gather the split candidates from all the workers and find the
|
||||
// global best candidates.
|
||||
auto const world_size = collective::GetWorldSize();
|
||||
dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
|
||||
auto all_candidates = dh::ToSpan(all_candidate_storage);
|
||||
collective::AllGather(device_, out_splits.data(), all_candidates.data(),
|
||||
out_splits.size() * sizeof(DeviceSplitCandidate));
|
||||
|
||||
// Reduce to get the best candidate from all workers.
|
||||
dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
|
||||
out_splits[i] = all_candidates[i];
|
||||
for (auto rank = 1; rank < world_size; rank++) {
|
||||
out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
|
||||
auto d_entries = out_entries;
|
||||
auto device_cats_accessor = this->DeviceCatStorage(nidx);
|
||||
@@ -471,8 +487,7 @@ void GPUHistEvaluator::EvaluateSplits(
|
||||
|
||||
if (split.is_cat) {
|
||||
SetCategoricalSplit(shared_inputs, d_sorted_idx, fidx, i,
|
||||
device_cats_accessor.GetNodeCatStorage(input.nidx),
|
||||
&out_splits[i]);
|
||||
device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]);
|
||||
}
|
||||
|
||||
float base_weight =
|
||||
@@ -510,6 +525,4 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
|
||||
#endif
|
||||
return root_entry;
|
||||
}
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -37,8 +37,8 @@ struct EvaluateSplitSharedInputs {
|
||||
common::Span<const float> feature_values;
|
||||
common::Span<const float> min_fvalue;
|
||||
bool is_dense;
|
||||
XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
|
||||
__device__ auto FeatureBins(bst_feature_t fidx) const {
|
||||
[[nodiscard]] XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
|
||||
[[nodiscard]] __device__ std::uint32_t FeatureBins(bst_feature_t fidx) const {
|
||||
return feature_segments[fidx + 1] - feature_segments[fidx];
|
||||
}
|
||||
};
|
||||
@@ -83,6 +83,9 @@ class GPUHistEvaluator {
|
||||
// Number of elements of categorical storage type
|
||||
// needed to hold categoricals for a single node
|
||||
std::size_t node_categorical_storage_size_ = 0;
|
||||
// Is the data split column-wise?
|
||||
bool is_column_split_ = false;
|
||||
int32_t device_;
|
||||
|
||||
// Copy the categories from device to host asynchronously.
|
||||
void CopyToHost( const std::vector<bst_node_t>& nidx);
|
||||
@@ -102,7 +105,7 @@ class GPUHistEvaluator {
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get device category storage of nidx for internal calculation.
|
||||
* @brief Get device category storage of nidx for internal calculation.
|
||||
*/
|
||||
auto DeviceCatStorage(const std::vector<bst_node_t> &nidx) {
|
||||
if (!has_categoricals_) return CatAccessor{};
|
||||
@@ -117,8 +120,8 @@ class GPUHistEvaluator {
|
||||
/**
|
||||
* \brief Get sorted index storage based on the left node of inputs.
|
||||
*/
|
||||
auto SortedIdx(int num_nodes, bst_feature_t total_bins) {
|
||||
if(!need_sort_histogram_) return common::Span<bst_feature_t>();
|
||||
auto SortedIdx(int num_nodes, bst_bin_t total_bins) {
|
||||
if (!need_sort_histogram_) return common::Span<bst_feature_t>{};
|
||||
cat_sorted_idx_.resize(num_nodes * total_bins);
|
||||
return dh::ToSpan(cat_sorted_idx_);
|
||||
}
|
||||
@@ -136,18 +139,29 @@ class GPUHistEvaluator {
|
||||
* \brief Reset the evaluator, should be called before any use.
|
||||
*/
|
||||
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
|
||||
bst_feature_t n_features, TrainParam const &param, int32_t device);
|
||||
bst_feature_t n_features, TrainParam const &param, bool is_column_split,
|
||||
int32_t device);
|
||||
|
||||
/**
|
||||
* \brief Get host category storage for nidx. Different from the internal version, this
|
||||
* returns strictly 1 node.
|
||||
*/
|
||||
common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
|
||||
[[nodiscard]] common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
|
||||
copy_stream_.View().Sync();
|
||||
auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(
|
||||
nidx * node_categorical_storage_size_, node_categorical_storage_size_);
|
||||
return cats_out;
|
||||
}
|
||||
|
||||
[[nodiscard]] auto GetDeviceNodeCats(bst_node_t nidx) {
|
||||
copy_stream_.View().Sync();
|
||||
if (has_categoricals_) {
|
||||
CatAccessor accessor = {dh::ToSpan(split_cats_), node_categorical_storage_size_};
|
||||
return common::KCatBitField{accessor.GetNodeCatStorage(nidx)};
|
||||
} else {
|
||||
return common::KCatBitField{};
|
||||
}
|
||||
}
|
||||
/**
|
||||
* \brief Add a split to the internal tree evaluator.
|
||||
*/
|
||||
|
||||
@@ -14,10 +14,9 @@
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
|
||||
common::Span<FeatureType const> ft,
|
||||
bst_feature_t n_features, TrainParam const &param,
|
||||
int32_t device) {
|
||||
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
|
||||
bst_feature_t n_features, TrainParam const &param,
|
||||
bool is_column_split, int32_t device) {
|
||||
param_ = param;
|
||||
tree_evaluator_ = TreeEvaluator{param, n_features, device};
|
||||
has_categoricals_ = cuts.HasCategorical();
|
||||
@@ -93,6 +92,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
|
||||
});
|
||||
#endif
|
||||
}
|
||||
is_column_split_ = is_column_split;
|
||||
device_ = device;
|
||||
}
|
||||
|
||||
common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
|
||||
|
||||
@@ -8,10 +8,10 @@
|
||||
#include <xgboost/logging.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef> // for size_t
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
|
||||
#include "../../common/compressed_iterator.h"
|
||||
#include "../../common/cuda_context.cuh" // for CUDAContext
|
||||
#include "../../common/random.h"
|
||||
#include "../param.h"
|
||||
@@ -146,27 +146,30 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
|
||||
CombineGradientPair combine_;
|
||||
};
|
||||
|
||||
NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
|
||||
NoSampling::NoSampling(BatchParam batch_param) : batch_param_(std::move(batch_param)) {}
|
||||
|
||||
GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
|
||||
GradientBasedSample NoSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) {
|
||||
return {dmat->Info().num_row_, page_, gpair};
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
|
||||
return {dmat->Info().num_row_, page, gpair};
|
||||
}
|
||||
|
||||
ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
|
||||
size_t n_rows, BatchParam batch_param)
|
||||
: batch_param_{std::move(batch_param)},
|
||||
page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
|
||||
n_rows)) {}
|
||||
ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
|
||||
: batch_param_{std::move(batch_param)} {}
|
||||
|
||||
GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
|
||||
common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) {
|
||||
if (!page_concatenated_) {
|
||||
// Concatenate all the external memory ELLPACK pages into a single in-memory page.
|
||||
page_.reset(nullptr);
|
||||
size_t offset = 0;
|
||||
for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
|
||||
auto page = batch.Impl();
|
||||
if (!page_) {
|
||||
page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
|
||||
page->row_stride, dmat->Info().num_row_);
|
||||
}
|
||||
size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
|
||||
offset += num_elements;
|
||||
}
|
||||
@@ -175,8 +178,8 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
|
||||
return {dmat->Info().num_row_, page_.get(), gpair};
|
||||
}
|
||||
|
||||
UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
|
||||
: page_(page), subsample_(subsample) {}
|
||||
UniformSampling::UniformSampling(BatchParam batch_param, float subsample)
|
||||
: batch_param_{std::move(batch_param)}, subsample_(subsample) {}
|
||||
|
||||
GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) {
|
||||
@@ -185,7 +188,8 @@ GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<Gra
|
||||
thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<std::size_t>(0),
|
||||
BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair());
|
||||
return {dmat->Info().num_row_, page_, gpair};
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
|
||||
return {dmat->Info().num_row_, page, gpair};
|
||||
}
|
||||
|
||||
ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
|
||||
@@ -198,27 +202,27 @@ ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
|
||||
GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
|
||||
common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) {
|
||||
auto cuctx = ctx->CUDACtx();
|
||||
// Set gradient pair to 0 with p = 1 - subsample
|
||||
thrust::replace_if(dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<size_t>(0),
|
||||
BernoulliTrial(common::GlobalRandom()(), subsample_),
|
||||
GradientPair());
|
||||
thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<std::size_t>(0),
|
||||
BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair{});
|
||||
|
||||
// Count the sampled rows.
|
||||
size_t sample_rows = thrust::count_if(dh::tbegin(gpair), dh::tend(gpair), IsNonZero());
|
||||
size_t sample_rows =
|
||||
thrust::count_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), IsNonZero{});
|
||||
|
||||
// Compact gradient pairs.
|
||||
gpair_.resize(sample_rows);
|
||||
thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
|
||||
thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero{});
|
||||
|
||||
// Index the sample rows.
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
|
||||
thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
|
||||
IsNonZero());
|
||||
thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
|
||||
sample_row_index_.begin());
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
|
||||
sample_row_index_.begin(),
|
||||
sample_row_index_.begin(),
|
||||
ClearEmptyRows());
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
|
||||
sample_row_index_.begin(), ClearEmptyRows());
|
||||
|
||||
auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
|
||||
auto first_page = (*batch_iterator.begin()).Impl();
|
||||
@@ -228,7 +232,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
|
||||
first_page->row_stride, sample_rows));
|
||||
|
||||
// Compact the ELLPACK pages into the single sample page.
|
||||
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
||||
thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
||||
for (auto& batch : batch_iterator) {
|
||||
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
|
||||
}
|
||||
@@ -236,12 +240,10 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
|
||||
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
|
||||
}
|
||||
|
||||
GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam&,
|
||||
GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batch_param,
|
||||
float subsample)
|
||||
: page_(page),
|
||||
subsample_(subsample),
|
||||
: subsample_(subsample),
|
||||
batch_param_{std::move(batch_param)},
|
||||
threshold_(n_rows + 1, 0.0f),
|
||||
grad_sum_(n_rows, 0.0f) {}
|
||||
|
||||
@@ -252,18 +254,19 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
|
||||
size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
|
||||
gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
|
||||
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
|
||||
|
||||
// Perform Poisson sampling in place.
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
|
||||
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
|
||||
RandomWeight(common::GlobalRandom()())));
|
||||
return {n_rows, page_, gpair};
|
||||
return {n_rows, page, gpair};
|
||||
}
|
||||
|
||||
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
|
||||
size_t n_rows,
|
||||
BatchParam batch_param,
|
||||
float subsample)
|
||||
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows,
|
||||
BatchParam batch_param,
|
||||
float subsample)
|
||||
: batch_param_(std::move(batch_param)),
|
||||
subsample_(subsample),
|
||||
threshold_(n_rows + 1, 0.0f),
|
||||
@@ -273,16 +276,15 @@ ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
|
||||
GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* ctx,
|
||||
common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) {
|
||||
size_t n_rows = dmat->Info().num_row_;
|
||||
auto cuctx = ctx->CUDACtx();
|
||||
bst_row_t n_rows = dmat->Info().num_row_;
|
||||
size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
|
||||
gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
|
||||
|
||||
// Perform Poisson sampling in place.
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<size_t>(0),
|
||||
dh::tbegin(gpair),
|
||||
PoissonSampling(dh::ToSpan(threshold_),
|
||||
threshold_index,
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
|
||||
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
|
||||
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
|
||||
RandomWeight(common::GlobalRandom()())));
|
||||
|
||||
// Count the sampled rows.
|
||||
@@ -290,16 +292,15 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
|
||||
|
||||
// Compact gradient pairs.
|
||||
gpair_.resize(sample_rows);
|
||||
thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
|
||||
thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
|
||||
|
||||
// Index the sample rows.
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
|
||||
thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
|
||||
sample_row_index_.begin());
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
|
||||
sample_row_index_.begin(),
|
||||
sample_row_index_.begin(),
|
||||
ClearEmptyRows());
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
|
||||
IsNonZero());
|
||||
thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
|
||||
sample_row_index_.begin());
|
||||
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
|
||||
sample_row_index_.begin(), ClearEmptyRows());
|
||||
|
||||
auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
|
||||
auto first_page = (*batch_iterator.begin()).Impl();
|
||||
@@ -317,13 +318,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
|
||||
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
|
||||
}
|
||||
|
||||
GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
|
||||
size_t n_rows, const BatchParam& batch_param,
|
||||
float subsample, int sampling_method) {
|
||||
GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows,
|
||||
const BatchParam& batch_param, float subsample,
|
||||
int sampling_method, bool is_external_memory) {
|
||||
// The ctx is kept here for future development of stream-based operations.
|
||||
monitor_.Init("gradient_based_sampler");
|
||||
|
||||
bool is_sampling = subsample < 1.0;
|
||||
bool is_external_memory = page->n_rows != n_rows;
|
||||
|
||||
if (is_sampling) {
|
||||
switch (sampling_method) {
|
||||
@@ -331,24 +332,24 @@ GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl c
|
||||
if (is_external_memory) {
|
||||
strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
|
||||
} else {
|
||||
strategy_.reset(new UniformSampling(page, subsample));
|
||||
strategy_.reset(new UniformSampling(batch_param, subsample));
|
||||
}
|
||||
break;
|
||||
case TrainParam::kGradientBased:
|
||||
if (is_external_memory) {
|
||||
strategy_.reset(
|
||||
new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
|
||||
strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
|
||||
} else {
|
||||
strategy_.reset(new GradientBasedSampling(page, n_rows, batch_param, subsample));
|
||||
strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
|
||||
}
|
||||
break;
|
||||
default:LOG(FATAL) << "unknown sampling method";
|
||||
default:
|
||||
LOG(FATAL) << "unknown sampling method";
|
||||
}
|
||||
} else {
|
||||
if (is_external_memory) {
|
||||
strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
|
||||
strategy_.reset(new ExternalMemoryNoSampling(batch_param));
|
||||
} else {
|
||||
strategy_.reset(new NoSampling(page));
|
||||
strategy_.reset(new NoSampling(batch_param));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -362,11 +363,11 @@ GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
|
||||
return sample;
|
||||
}
|
||||
|
||||
size_t GradientBasedSampler::CalculateThresholdIndex(
|
||||
common::Span<GradientPair> gpair, common::Span<float> threshold,
|
||||
common::Span<float> grad_sum, size_t sample_rows) {
|
||||
thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold),
|
||||
std::numeric_limits<float>::max());
|
||||
size_t GradientBasedSampler::CalculateThresholdIndex(common::Span<GradientPair> gpair,
|
||||
common::Span<float> threshold,
|
||||
common::Span<float> grad_sum,
|
||||
size_t sample_rows) {
|
||||
thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold), std::numeric_limits<float>::max());
|
||||
thrust::transform(dh::tbegin(gpair), dh::tend(gpair), dh::tbegin(threshold),
|
||||
CombineGradientPair());
|
||||
thrust::sort(dh::tbegin(threshold), dh::tend(threshold) - 1);
|
||||
@@ -379,6 +380,5 @@ size_t GradientBasedSampler::CalculateThresholdIndex(
|
||||
thrust::min_element(dh::tbegin(grad_sum), dh::tend(grad_sum));
|
||||
return thrust::distance(dh::tbegin(grad_sum), min) + 1;
|
||||
}
|
||||
|
||||
}; // namespace tree
|
||||
}; // namespace xgboost
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2019 by XGBoost Contributors
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <xgboost/base.h>
|
||||
@@ -32,37 +32,36 @@ class SamplingStrategy {
|
||||
/*! \brief No sampling in in-memory mode. */
|
||||
class NoSampling : public SamplingStrategy {
|
||||
public:
|
||||
explicit NoSampling(EllpackPageImpl const* page);
|
||||
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
EllpackPageImpl const* page_;
|
||||
};
|
||||
|
||||
/*! \brief No sampling in external memory mode. */
|
||||
class ExternalMemoryNoSampling : public SamplingStrategy {
|
||||
public:
|
||||
ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
|
||||
BatchParam batch_param);
|
||||
explicit NoSampling(BatchParam batch_param);
|
||||
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
BatchParam batch_param_;
|
||||
std::unique_ptr<EllpackPageImpl> page_;
|
||||
};
|
||||
|
||||
/*! \brief No sampling in external memory mode. */
|
||||
class ExternalMemoryNoSampling : public SamplingStrategy {
|
||||
public:
|
||||
explicit ExternalMemoryNoSampling(BatchParam batch_param);
|
||||
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
BatchParam batch_param_;
|
||||
std::unique_ptr<EllpackPageImpl> page_{nullptr};
|
||||
bool page_concatenated_{false};
|
||||
};
|
||||
|
||||
/*! \brief Uniform sampling in in-memory mode. */
|
||||
class UniformSampling : public SamplingStrategy {
|
||||
public:
|
||||
UniformSampling(EllpackPageImpl const* page, float subsample);
|
||||
UniformSampling(BatchParam batch_param, float subsample);
|
||||
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
EllpackPageImpl const* page_;
|
||||
BatchParam batch_param_;
|
||||
float subsample_;
|
||||
};
|
||||
|
||||
@@ -84,13 +83,12 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
|
||||
/*! \brief Gradient-based sampling in in-memory mode. */
|
||||
class GradientBasedSampling : public SamplingStrategy {
|
||||
public:
|
||||
GradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param,
|
||||
float subsample);
|
||||
GradientBasedSampling(std::size_t n_rows, BatchParam batch_param, float subsample);
|
||||
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
|
||||
DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
EllpackPageImpl const* page_;
|
||||
BatchParam batch_param_;
|
||||
float subsample_;
|
||||
dh::caching_device_vector<float> threshold_;
|
||||
dh::caching_device_vector<float> grad_sum_;
|
||||
@@ -106,11 +104,11 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
|
||||
private:
|
||||
BatchParam batch_param_;
|
||||
float subsample_;
|
||||
dh::caching_device_vector<float> threshold_;
|
||||
dh::caching_device_vector<float> grad_sum_;
|
||||
dh::device_vector<float> threshold_;
|
||||
dh::device_vector<float> grad_sum_;
|
||||
std::unique_ptr<EllpackPageImpl> page_;
|
||||
dh::device_vector<GradientPair> gpair_;
|
||||
dh::caching_device_vector<size_t> sample_row_index_;
|
||||
dh::device_vector<size_t> sample_row_index_;
|
||||
};
|
||||
|
||||
/*! \brief Draw a sample of rows from a DMatrix.
|
||||
@@ -124,8 +122,8 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
|
||||
*/
|
||||
class GradientBasedSampler {
|
||||
public:
|
||||
GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
|
||||
const BatchParam& batch_param, float subsample, int sampling_method);
|
||||
GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param,
|
||||
float subsample, int sampling_method, bool is_external_memory);
|
||||
|
||||
/*! \brief Sample from a DMatrix based on the given gradient pairs. */
|
||||
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <cstdint> // uint32_t
|
||||
#include <limits>
|
||||
|
||||
#include "../../collective/aggregator.h"
|
||||
#include "../../common/deterministic.cuh"
|
||||
#include "../../common/device_helpers.cuh"
|
||||
#include "../../data/ellpack_page.cuh"
|
||||
@@ -52,7 +53,7 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
|
||||
*
|
||||
* to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
|
||||
*/
|
||||
GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
|
||||
GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info) {
|
||||
using GradientSumT = GradientPairPrecise;
|
||||
using T = typename GradientSumT::ValueT;
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
@@ -70,11 +71,11 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
|
||||
// Treat pair as array of 4 primitive types to allreduce
|
||||
using ReduceT = typename decltype(p.first)::ValueT;
|
||||
static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
|
||||
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<ReduceT*>(&p), 4);
|
||||
collective::GlobalSum(info, reinterpret_cast<ReduceT*>(&p), 4);
|
||||
GradientPair positive_sum{p.first}, negative_sum{p.second};
|
||||
|
||||
std::size_t total_rows = gpair.size();
|
||||
collective::Allreduce<collective::Operation::kSum>(&total_rows, 1);
|
||||
collective::GlobalSum(info, &total_rows, 1);
|
||||
|
||||
auto histogram_rounding =
|
||||
GradientSumT{common::CreateRoundingFactor<T>(
|
||||
|
||||
@@ -39,7 +39,7 @@ private:
|
||||
GradientPairPrecise to_floating_point_;
|
||||
|
||||
public:
|
||||
explicit GradientQuantiser(common::Span<GradientPair const> gpair);
|
||||
GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info);
|
||||
XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
|
||||
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
|
||||
gpair.GetHess() * to_fixed_point_.GetHess());
|
||||
|
||||
@@ -24,21 +24,13 @@ RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
|
||||
|
||||
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
|
||||
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaStreamCreate(&stream_));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipStreamCreate(&stream_));
|
||||
#endif
|
||||
}
|
||||
|
||||
RowPartitioner::~RowPartitioner() {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(device_idx_));
|
||||
dh::safe_cuda(cudaStreamDestroy(stream_));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(device_idx_));
|
||||
dh::safe_cuda(hipStreamDestroy(stream_));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -116,13 +116,7 @@ template <typename RowIndexT, typename OpT, typename OpDataT>
|
||||
void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
|
||||
common::Span<RowIndexT> ridx, common::Span<RowIndexT> ridx_tmp,
|
||||
common::Span<bst_uint> d_counts, std::size_t total_rows, OpT op,
|
||||
dh::device_vector<int8_t>* tmp,
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
hipStream_t stream
|
||||
#elif defined(XGBOOST_USE_CUDA)
|
||||
cudaStream_t stream
|
||||
#endif
|
||||
) {
|
||||
dh::device_vector<int8_t>* tmp) {
|
||||
dh::LDGIterator<PerNodeData<OpDataT>> batch_info_itr(d_batch_info.data());
|
||||
WriteResultsFunctor<OpDataT> write_results{batch_info_itr, ridx.data(), ridx_tmp.data(),
|
||||
d_counts.data()};
|
||||
@@ -135,29 +129,28 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
|
||||
int batch_idx;
|
||||
std::size_t item_idx;
|
||||
AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx);
|
||||
auto op_res = op(ridx[item_idx], batch_info_itr[batch_idx].data);
|
||||
auto op_res = op(ridx[item_idx], batch_idx, batch_info_itr[batch_idx].data);
|
||||
return IndexFlagTuple{static_cast<bst_uint>(item_idx), op_res, batch_idx, op_res};
|
||||
});
|
||||
size_t temp_bytes = 0;
|
||||
if (tmp->empty()) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
|
||||
IndexFlagOp(), total_rows, stream);
|
||||
IndexFlagOp(), total_rows);
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
rocprim::inclusive_scan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
|
||||
total_rows, IndexFlagOp(), stream);
|
||||
total_rows,IndexFlagOp());
|
||||
#endif
|
||||
|
||||
tmp->resize(temp_bytes);
|
||||
}
|
||||
temp_bytes = tmp->size();
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator,
|
||||
discard_write_iterator, IndexFlagOp(), total_rows, stream);
|
||||
discard_write_iterator, IndexFlagOp(), total_rows);
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
rocprim::inclusive_scan(tmp->data().get(), temp_bytes, input_iterator, discard_write_iterator,
|
||||
total_rows, IndexFlagOp(), stream);
|
||||
rocprim::inclusive_scan(tmp->data().get(), temp_bytes, input_iterator,
|
||||
discard_write_iterator, total_rows, IndexFlagOp());
|
||||
#endif
|
||||
|
||||
constexpr int kBlockSize = 256;
|
||||
@@ -167,7 +160,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
|
||||
const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread);
|
||||
|
||||
SortPositionCopyKernel<kBlockSize, RowIndexT, OpDataT>
|
||||
<<<grid_size, kBlockSize, 0, stream>>>(batch_info_itr, ridx, ridx_tmp, total_rows);
|
||||
<<<grid_size, kBlockSize, 0>>>(batch_info_itr, ridx, ridx_tmp, total_rows);
|
||||
}
|
||||
|
||||
struct NodePositionInfo {
|
||||
@@ -240,12 +233,6 @@ class RowPartitioner {
|
||||
dh::PinnedMemory pinned_;
|
||||
dh::PinnedMemory pinned2_;
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
hipStream_t stream_;
|
||||
#else
|
||||
cudaStream_t stream_;
|
||||
#endif
|
||||
|
||||
public:
|
||||
RowPartitioner(int device_idx, size_t num_rows);
|
||||
~RowPartitioner();
|
||||
@@ -303,11 +290,11 @@ class RowPartitioner {
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
|
||||
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
|
||||
hipMemcpyDefault, stream_));
|
||||
hipMemcpyDefault));
|
||||
#else
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
|
||||
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
|
||||
cudaMemcpyDefault, stream_));
|
||||
cudaMemcpyDefault));
|
||||
#endif
|
||||
|
||||
// Temporary arrays
|
||||
@@ -317,23 +304,17 @@ class RowPartitioner {
|
||||
// Partition the rows according to the operator
|
||||
SortPositionBatch<RowIndexT, UpdatePositionOpT, OpDataT>(
|
||||
dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts),
|
||||
total_rows, op, &tmp_, stream_);
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
|
||||
hipMemcpyDefault, stream_));
|
||||
#else
|
||||
total_rows, op, &tmp_);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
|
||||
cudaMemcpyDefault, stream_));
|
||||
cudaMemcpyDefault));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
|
||||
hipMemcpyDefault));
|
||||
#endif
|
||||
|
||||
// TODO(Rory): this synchronisation hurts performance a lot
|
||||
// Future optimisation should find a way to skip this
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipStreamSynchronize(stream_));
|
||||
#else
|
||||
dh::safe_cuda(cudaStreamSynchronize(stream_));
|
||||
#endif
|
||||
dh::DefaultStream().Sync();
|
||||
|
||||
// Update segments
|
||||
for (size_t i = 0; i < nidx.size(); i++) {
|
||||
@@ -370,18 +351,18 @@ class RowPartitioner {
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
|
||||
sizeof(NodePositionInfo) * ridx_segments_.size(),
|
||||
hipMemcpyDefault, stream_));
|
||||
hipMemcpyDefault));
|
||||
#else
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
|
||||
sizeof(NodePositionInfo) * ridx_segments_.size(),
|
||||
cudaMemcpyDefault, stream_));
|
||||
cudaMemcpyDefault));
|
||||
#endif
|
||||
|
||||
constexpr int kBlockSize = 512;
|
||||
const int kItemsThread = 8;
|
||||
const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread);
|
||||
common::Span<const RowIndexT> d_ridx(ridx_.data().get(), ridx_.size());
|
||||
FinalisePositionKernel<kBlockSize><<<grid_size, kBlockSize, 0, stream_>>>(
|
||||
FinalisePositionKernel<kBlockSize><<<grid_size, kBlockSize, 0>>>(
|
||||
dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -4,13 +4,13 @@
|
||||
#ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
|
||||
#define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
|
||||
|
||||
#include <algorithm> // for copy
|
||||
#include <cstddef> // for size_t
|
||||
#include <limits> // for numeric_limits
|
||||
#include <memory> // for shared_ptr
|
||||
#include <numeric> // for accumulate
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
#include <algorithm> // for copy
|
||||
#include <cstddef> // for size_t
|
||||
#include <limits> // for numeric_limits
|
||||
#include <memory> // for shared_ptr
|
||||
#include <numeric> // for accumulate
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../../common/categorical.h" // for CatBitField
|
||||
#include "../../common/hist_util.h" // for GHistRow, HistogramCuts
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "../param.h" // for TrainParam
|
||||
#include "../split_evaluator.h" // for TreeEvaluator
|
||||
#include "expand_entry.h" // for MultiExpandEntry
|
||||
#include "hist_cache.h" // for BoundedHistCollection
|
||||
#include "xgboost/base.h" // for bst_node_t, bst_target_t, bst_feature_t
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/linalg.h" // for Constants, Vector
|
||||
@@ -65,7 +66,7 @@ class HistEvaluator {
|
||||
* pseudo-category for missing value but here we just do a complete scan to avoid
|
||||
* making specialized histogram bin.
|
||||
*/
|
||||
void EnumerateOneHot(common::HistogramCuts const &cut, const common::GHistRow &hist,
|
||||
void EnumerateOneHot(common::HistogramCuts const &cut, common::ConstGHistRow hist,
|
||||
bst_feature_t fidx, bst_node_t nidx,
|
||||
TreeEvaluator::SplitEvaluator<TrainParam> const &evaluator,
|
||||
SplitEntry *p_best) const {
|
||||
@@ -143,7 +144,7 @@ class HistEvaluator {
|
||||
*/
|
||||
template <int d_step>
|
||||
void EnumeratePart(common::HistogramCuts const &cut, common::Span<size_t const> sorted_idx,
|
||||
common::GHistRow const &hist, bst_feature_t fidx, bst_node_t nidx,
|
||||
common::ConstGHistRow hist, bst_feature_t fidx, bst_node_t nidx,
|
||||
TreeEvaluator::SplitEvaluator<TrainParam> const &evaluator,
|
||||
SplitEntry *p_best) {
|
||||
static_assert(d_step == +1 || d_step == -1, "Invalid step.");
|
||||
@@ -214,7 +215,7 @@ class HistEvaluator {
|
||||
// Returns the sum of gradients corresponding to the data points that contain
|
||||
// a non-missing value for the particular feature fid.
|
||||
template <int d_step>
|
||||
GradStats EnumerateSplit(common::HistogramCuts const &cut, const common::GHistRow &hist,
|
||||
GradStats EnumerateSplit(common::HistogramCuts const &cut, common::ConstGHistRow hist,
|
||||
bst_feature_t fidx, bst_node_t nidx,
|
||||
TreeEvaluator::SplitEvaluator<TrainParam> const &evaluator,
|
||||
SplitEntry *p_best) const {
|
||||
@@ -317,7 +318,7 @@ class HistEvaluator {
|
||||
}
|
||||
|
||||
public:
|
||||
void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
|
||||
void EvaluateSplits(const BoundedHistCollection &hist, common::HistogramCuts const &cut,
|
||||
common::Span<FeatureType const> feature_types, const RegTree &tree,
|
||||
std::vector<CPUExpandEntry> *p_entries) {
|
||||
auto n_threads = ctx_->Threads();
|
||||
@@ -454,8 +455,8 @@ class HistEvaluator {
|
||||
right_child);
|
||||
}
|
||||
|
||||
auto Evaluator() const { return tree_evaluator_.GetEvaluator(); }
|
||||
auto const& Stats() const { return snode_; }
|
||||
[[nodiscard]] auto Evaluator() const { return tree_evaluator_.GetEvaluator(); }
|
||||
[[nodiscard]] auto const &Stats() const { return snode_; }
|
||||
|
||||
float InitRoot(GradStats const &root_sum) {
|
||||
snode_.resize(1);
|
||||
@@ -510,7 +511,7 @@ class HistMultiEvaluator {
|
||||
|
||||
template <bst_bin_t d_step>
|
||||
bool EnumerateSplit(common::HistogramCuts const &cut, bst_feature_t fidx,
|
||||
common::Span<common::GHistRow const> hist,
|
||||
common::Span<common::ConstGHistRow> hist,
|
||||
linalg::VectorView<GradientPairPrecise const> parent_sum, double parent_gain,
|
||||
SplitEntryContainer<std::vector<GradientPairPrecise>> *p_best) const {
|
||||
auto const &cut_ptr = cut.Ptrs();
|
||||
@@ -623,7 +624,7 @@ class HistMultiEvaluator {
|
||||
}
|
||||
|
||||
public:
|
||||
void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
|
||||
void EvaluateSplits(RegTree const &tree, common::Span<const BoundedHistCollection *> hist,
|
||||
common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
|
||||
auto &entries = *p_entries;
|
||||
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(entries.size());
|
||||
@@ -651,9 +652,9 @@ class HistMultiEvaluator {
|
||||
auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
|
||||
auto best = &entry->split;
|
||||
auto parent_sum = stats_.Slice(entry->nid, linalg::All());
|
||||
std::vector<common::GHistRow> node_hist;
|
||||
std::vector<common::ConstGHistRow> node_hist;
|
||||
for (auto t_hist : hist) {
|
||||
node_hist.push_back((*t_hist)[entry->nid]);
|
||||
node_hist.emplace_back((*t_hist)[entry->nid]);
|
||||
}
|
||||
auto features_set = features[nidx_in_set]->ConstHostSpan();
|
||||
|
||||
@@ -773,7 +774,7 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree,
|
||||
std::vector<Partitioner> const &partitioner,
|
||||
linalg::VectorView<float> out_preds) {
|
||||
auto const &tree = *p_last_tree;
|
||||
CHECK_EQ(out_preds.DeviceIdx(), Context::kCpuId);
|
||||
CHECK(out_preds.Device().IsCPU());
|
||||
size_t n_nodes = p_last_tree->GetNodes().size();
|
||||
for (auto &part : partitioner) {
|
||||
CHECK_EQ(part.Size(), n_nodes);
|
||||
@@ -808,7 +809,7 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree,
|
||||
auto n_nodes = mttree->Size();
|
||||
auto n_targets = tree.NumTargets();
|
||||
CHECK_EQ(out_preds.Shape(1), n_targets);
|
||||
CHECK_EQ(out_preds.DeviceIdx(), Context::kCpuId);
|
||||
CHECK(out_preds.Device().IsCPU());
|
||||
|
||||
for (auto &part : partitioner) {
|
||||
CHECK_EQ(part.Size(), n_nodes);
|
||||
|
||||
@@ -18,8 +18,8 @@ namespace xgboost::tree {
|
||||
*/
|
||||
template <typename Impl>
|
||||
struct ExpandEntryImpl {
|
||||
bst_node_t nid;
|
||||
bst_node_t depth;
|
||||
bst_node_t nid{0};
|
||||
bst_node_t depth{0};
|
||||
|
||||
[[nodiscard]] float GetLossChange() const {
|
||||
return static_cast<Impl const*>(this)->split.loss_chg;
|
||||
|
||||
113
src/tree/hist/hist_cache.h
Normal file
113
src/tree/hist/hist_cache.h
Normal file
@@ -0,0 +1,113 @@
|
||||
/**
|
||||
* Copyright 2023 by XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_TREE_HIST_HIST_CACHE_H_
|
||||
#define XGBOOST_TREE_HIST_HIST_CACHE_H_
|
||||
#include <cstddef> // for size_t
|
||||
#include <map> // for map
|
||||
#include <memory> // for unique_ptr
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow
|
||||
#include "../../common/ref_resource_view.h" // for ReallocVector
|
||||
#include "xgboost/base.h" // for bst_node_t, bst_bin_t
|
||||
#include "xgboost/logging.h" // for CHECK_GT
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::tree {
|
||||
/**
|
||||
* @brief A persistent cache for CPU histogram.
|
||||
*
|
||||
* The size of the cache is first bounded by the `Driver` class then by this cache
|
||||
* implementation. The former limits the number of nodes that can be built for each node
|
||||
* batch, while this cache limits the number of all nodes up to the size of
|
||||
* max(|node_batch|, n_cached_node).
|
||||
*
|
||||
* The caller is responsible for clearing up the cache as it needs to rearrange the
|
||||
* nodes before making overflowed allocations. The struct only reports whether the size
|
||||
* limit has been reached.
|
||||
*/
|
||||
class BoundedHistCollection {
|
||||
// maps node index to offset in `data_`.
|
||||
std::map<bst_node_t, std::size_t> node_map_;
|
||||
// currently allocated bins, used for tracking consistency.
|
||||
std::size_t current_size_{0};
|
||||
|
||||
// stores the histograms in a contiguous buffer
|
||||
using Vec = common::ReallocVector<GradientPairPrecise>;
|
||||
std::unique_ptr<Vec> data_{new Vec{}}; // nvcc 12.1 trips over std::make_unique
|
||||
|
||||
// number of histogram bins across all features
|
||||
bst_bin_t n_total_bins_{0};
|
||||
// limits the number of nodes that can be in the cache for each tree
|
||||
std::size_t n_cached_nodes_{0};
|
||||
// whether the tree has grown beyond the cache limit
|
||||
bool has_exceeded_{false};
|
||||
|
||||
public:
|
||||
BoundedHistCollection() = default;
|
||||
common::GHistRow operator[](std::size_t idx) {
|
||||
auto offset = node_map_.at(idx);
|
||||
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
|
||||
}
|
||||
common::ConstGHistRow operator[](std::size_t idx) const {
|
||||
auto offset = node_map_.at(idx);
|
||||
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
|
||||
}
|
||||
void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
|
||||
n_total_bins_ = n_total_bins;
|
||||
n_cached_nodes_ = n_cached_nodes;
|
||||
this->Clear(false);
|
||||
}
|
||||
/**
|
||||
* @brief Clear the cache, mark whether the cache has exceeded the limit.
|
||||
*/
|
||||
void Clear(bool exceeded) {
|
||||
node_map_.clear();
|
||||
current_size_ = 0;
|
||||
has_exceeded_ = exceeded;
|
||||
}
|
||||
|
||||
[[nodiscard]] bool CanHost(common::Span<bst_node_t const> nodes_to_build,
|
||||
common::Span<bst_node_t const> nodes_to_sub) const {
|
||||
auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
|
||||
return n_new_nodes + node_map_.size() <= n_cached_nodes_;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Allocate histogram buffers for all nodes.
|
||||
*
|
||||
* The resulting histogram buffer is contiguous for all nodes in the order of
|
||||
* allocation.
|
||||
*/
|
||||
void AllocateHistograms(common::Span<bst_node_t const> nodes_to_build,
|
||||
common::Span<bst_node_t const> nodes_to_sub) {
|
||||
auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
|
||||
auto alloc_size = n_new_nodes * n_total_bins_;
|
||||
auto new_size = alloc_size + current_size_;
|
||||
if (new_size > data_->size()) {
|
||||
data_->Resize(new_size);
|
||||
}
|
||||
for (auto nidx : nodes_to_build) {
|
||||
node_map_[nidx] = current_size_;
|
||||
current_size_ += n_total_bins_;
|
||||
}
|
||||
for (auto nidx : nodes_to_sub) {
|
||||
node_map_[nidx] = current_size_;
|
||||
current_size_ += n_total_bins_;
|
||||
}
|
||||
CHECK_EQ(current_size_, new_size);
|
||||
}
|
||||
void AllocateHistograms(std::vector<bst_node_t> const& nodes) {
|
||||
this->AllocateHistograms(common::Span<bst_node_t const>{nodes},
|
||||
common::Span<bst_node_t const>{});
|
||||
}
|
||||
|
||||
[[nodiscard]] bool HasExceeded() const { return has_exceeded_; }
|
||||
[[nodiscard]] bool HistogramExists(bst_node_t nidx) const {
|
||||
return node_map_.find(nidx) != node_map_.cend();
|
||||
}
|
||||
[[nodiscard]] std::size_t Size() const { return current_size_; }
|
||||
};
|
||||
} // namespace xgboost::tree
|
||||
#endif // XGBOOST_TREE_HIST_HIST_CACHE_H_
|
||||
63
src/tree/hist/histogram.cc
Normal file
63
src/tree/hist/histogram.cc
Normal file
@@ -0,0 +1,63 @@
|
||||
/**
|
||||
* Copyright 2023 by XGBoost Contributors
|
||||
*/
|
||||
#include "histogram.h"
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
#include <numeric> // for accumulate
|
||||
#include <utility> // for swap
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../../common/transform_iterator.h" // for MakeIndexTransformIter
|
||||
#include "expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
|
||||
#include "xgboost/logging.h" // for CHECK_NE
|
||||
#include "xgboost/span.h" // for Span
|
||||
#include "xgboost/tree_model.h" // for RegTree
|
||||
|
||||
namespace xgboost::tree {
|
||||
void AssignNodes(RegTree const *p_tree, std::vector<MultiExpandEntry> const &valid_candidates,
|
||||
common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
|
||||
CHECK_EQ(nodes_to_build.size(), valid_candidates.size());
|
||||
|
||||
std::size_t n_idx = 0;
|
||||
for (auto const &c : valid_candidates) {
|
||||
auto left_nidx = p_tree->LeftChild(c.nid);
|
||||
auto right_nidx = p_tree->RightChild(c.nid);
|
||||
|
||||
auto build_nidx = left_nidx;
|
||||
auto subtract_nidx = right_nidx;
|
||||
auto lit =
|
||||
common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
|
||||
auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
|
||||
auto rit =
|
||||
common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
|
||||
auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
|
||||
auto fewer_right = right_sum < left_sum;
|
||||
if (fewer_right) {
|
||||
std::swap(build_nidx, subtract_nidx);
|
||||
}
|
||||
nodes_to_build[n_idx] = build_nidx;
|
||||
nodes_to_sub[n_idx] = subtract_nidx;
|
||||
++n_idx;
|
||||
}
|
||||
}
|
||||
|
||||
void AssignNodes(RegTree const *p_tree, std::vector<CPUExpandEntry> const &candidates,
|
||||
common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
|
||||
std::size_t n_idx = 0;
|
||||
for (auto const &c : candidates) {
|
||||
auto left_nidx = (*p_tree)[c.nid].LeftChild();
|
||||
auto right_nidx = (*p_tree)[c.nid].RightChild();
|
||||
auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
|
||||
|
||||
auto build_nidx = left_nidx;
|
||||
auto subtract_nidx = right_nidx;
|
||||
if (fewer_right) {
|
||||
std::swap(build_nidx, subtract_nidx);
|
||||
}
|
||||
nodes_to_build[n_idx] = build_nidx;
|
||||
nodes_to_sub[n_idx] = subtract_nidx;
|
||||
++n_idx;
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::tree
|
||||
@@ -4,316 +4,229 @@
|
||||
#ifndef XGBOOST_TREE_HIST_HISTOGRAM_H_
|
||||
#define XGBOOST_TREE_HIST_HISTOGRAM_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
#include <algorithm> // for max
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int32_t
|
||||
#include <functional> // for function
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../../collective/communicator-inl.h"
|
||||
#include "../../common/hist_util.h"
|
||||
#include "../../data/gradient_index.h"
|
||||
#include "expand_entry.h"
|
||||
#include "xgboost/tree_model.h" // for RegTree
|
||||
#include "../../collective/communicator-inl.h" // for Allreduce
|
||||
#include "../../collective/communicator.h" // for Operation
|
||||
#include "../../common/hist_util.h" // for GHistRow, ParallelGHi...
|
||||
#include "../../common/row_set.h" // for RowSetCollection
|
||||
#include "../../common/threading_utils.h" // for ParallelFor2d, Range1d, BlockedSpace2d
|
||||
#include "../../data/gradient_index.h" // for GHistIndexMatrix
|
||||
#include "expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
|
||||
#include "hist_cache.h" // for BoundedHistCollection
|
||||
#include "param.h" // for HistMakerTrainParam
|
||||
#include "xgboost/base.h" // for bst_node_t, bst_target_t, bst_bin_t
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/data.h" // for BatchIterator, BatchSet
|
||||
#include "xgboost/linalg.h" // for MatrixView, All, Vect...
|
||||
#include "xgboost/logging.h" // for CHECK_GE
|
||||
#include "xgboost/span.h" // for Span
|
||||
#include "xgboost/tree_model.h" // for RegTree
|
||||
|
||||
namespace xgboost::tree {
|
||||
/**
|
||||
* @brief Decide which node to use as the build node for multi-target trees.
|
||||
*/
|
||||
void AssignNodes(RegTree const *p_tree, std::vector<MultiExpandEntry> const &valid_candidates,
|
||||
common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub);
|
||||
|
||||
/**
|
||||
* @brief Decide which node to use as the build node.
|
||||
*/
|
||||
void AssignNodes(RegTree const *p_tree, std::vector<CPUExpandEntry> const &candidates,
|
||||
common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub);
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
template <typename ExpandEntry>
|
||||
class HistogramBuilder {
|
||||
/*! \brief cumulative histogram of gradients. */
|
||||
common::HistCollection hist_;
|
||||
/*! \brief cumulative local parent histogram of gradients. */
|
||||
common::HistCollection hist_local_worker_;
|
||||
common::GHistBuilder builder_;
|
||||
BoundedHistCollection hist_;
|
||||
common::ParallelGHistBuilder buffer_;
|
||||
BatchParam param_;
|
||||
int32_t n_threads_{-1};
|
||||
size_t n_batches_{0};
|
||||
// Whether XGBoost is running in distributed environment.
|
||||
bool is_distributed_{false};
|
||||
bool is_col_split_{false};
|
||||
|
||||
public:
|
||||
/**
|
||||
* \param total_bins Total number of bins across all features
|
||||
* \param max_bin_per_feat Maximum number of bins per feature, same as the `max_bin`
|
||||
* training parameter.
|
||||
* \param n_threads Number of threads.
|
||||
* \param is_distributed Mostly used for testing to allow injecting parameters instead
|
||||
* @brief Reset the builder, should be called before growing a new tree.
|
||||
*
|
||||
* @param total_bins Total number of bins across all features
|
||||
* @param is_distributed Mostly used for testing to allow injecting parameters instead
|
||||
* of using global rabit variable.
|
||||
*/
|
||||
void Reset(uint32_t total_bins, BatchParam p, int32_t n_threads, size_t n_batches,
|
||||
bool is_distributed, bool is_col_split) {
|
||||
CHECK_GE(n_threads, 1);
|
||||
n_threads_ = n_threads;
|
||||
n_batches_ = n_batches;
|
||||
void Reset(Context const *ctx, bst_bin_t total_bins, BatchParam const &p, bool is_distributed,
|
||||
bool is_col_split, HistMakerTrainParam const *param) {
|
||||
n_threads_ = ctx->Threads();
|
||||
param_ = p;
|
||||
hist_.Init(total_bins);
|
||||
hist_local_worker_.Init(total_bins);
|
||||
hist_.Reset(total_bins, param->max_cached_hist_node);
|
||||
buffer_.Init(total_bins);
|
||||
builder_ = common::GHistBuilder(total_bins);
|
||||
is_distributed_ = is_distributed;
|
||||
is_col_split_ = is_col_split;
|
||||
// Workaround s390x gcc 7.5.0
|
||||
auto DMLC_ATTRIBUTE_UNUSED __force_instantiation = &GradientPairPrecise::Reduce;
|
||||
}
|
||||
|
||||
template <bool any_missing>
|
||||
void BuildLocalHistograms(size_t page_idx, common::BlockedSpace2d space,
|
||||
GHistIndexMatrix const &gidx,
|
||||
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
|
||||
void BuildLocalHistograms(common::BlockedSpace2d const &space, GHistIndexMatrix const &gidx,
|
||||
std::vector<bst_node_t> const &nodes_to_build,
|
||||
common::RowSetCollection const &row_set_collection,
|
||||
common::Span<GradientPair const> gpair_h, bool force_read_by_column) {
|
||||
const size_t n_nodes = nodes_for_explicit_hist_build.size();
|
||||
CHECK_GT(n_nodes, 0);
|
||||
|
||||
std::vector<common::GHistRow> target_hists(n_nodes);
|
||||
for (size_t i = 0; i < n_nodes; ++i) {
|
||||
auto const nidx = nodes_for_explicit_hist_build[i].nid;
|
||||
target_hists[i] = hist_[nidx];
|
||||
}
|
||||
if (page_idx == 0) {
|
||||
// FIXME(jiamingy): Handle different size of space. Right now we use the maximum
|
||||
// partition size for the buffer, which might not be efficient if partition sizes
|
||||
// has significant variance.
|
||||
buffer_.Reset(this->n_threads_, n_nodes, space, target_hists);
|
||||
}
|
||||
|
||||
// Parallel processing by nodes and data in each node
|
||||
common::ParallelFor2d(space, this->n_threads_, [&](size_t nid_in_set, common::Range1d r) {
|
||||
const auto tid = static_cast<unsigned>(omp_get_thread_num());
|
||||
const int32_t nid = nodes_for_explicit_hist_build[nid_in_set].nid;
|
||||
auto elem = row_set_collection[nid];
|
||||
bst_node_t const nidx = nodes_to_build[nid_in_set];
|
||||
auto elem = row_set_collection[nidx];
|
||||
auto start_of_row_set = std::min(r.begin(), elem.Size());
|
||||
auto end_of_row_set = std::min(r.end(), elem.Size());
|
||||
auto rid_set = common::RowSetCollection::Elem(elem.begin + start_of_row_set,
|
||||
elem.begin + end_of_row_set, nid);
|
||||
elem.begin + end_of_row_set, nidx);
|
||||
auto hist = buffer_.GetInitializedHist(tid, nid_in_set);
|
||||
if (rid_set.Size() != 0) {
|
||||
builder_.template BuildHist<any_missing>(gpair_h, rid_set, gidx, hist,
|
||||
force_read_by_column);
|
||||
common::BuildHist<any_missing>(gpair_h, rid_set, gidx, hist, force_read_by_column);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void AddHistRows(int *starting_index, int *sync_count,
|
||||
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
|
||||
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
|
||||
RegTree const *p_tree) {
|
||||
if (is_distributed_ && !is_col_split_) {
|
||||
this->AddHistRowsDistributed(starting_index, sync_count, nodes_for_explicit_hist_build,
|
||||
nodes_for_subtraction_trick, p_tree);
|
||||
} else {
|
||||
this->AddHistRowsLocal(starting_index, sync_count, nodes_for_explicit_hist_build,
|
||||
nodes_for_subtraction_trick);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* @brief Allocate histogram, rearrange the nodes if `rearrange` is true and the tree
|
||||
* has reached the cache size limit.
|
||||
*/
|
||||
void AddHistRows(RegTree const *p_tree, std::vector<bst_node_t> *p_nodes_to_build,
|
||||
std::vector<bst_node_t> *p_nodes_to_sub, bool rearrange) {
|
||||
CHECK(p_nodes_to_build);
|
||||
auto &nodes_to_build = *p_nodes_to_build;
|
||||
CHECK(p_nodes_to_sub);
|
||||
auto &nodes_to_sub = *p_nodes_to_sub;
|
||||
|
||||
/** Main entry point of this class, build histogram for tree nodes. */
|
||||
void BuildHist(size_t page_id, common::BlockedSpace2d space, GHistIndexMatrix const &gidx,
|
||||
RegTree const *p_tree, common::RowSetCollection const &row_set_collection,
|
||||
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
|
||||
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
|
||||
common::Span<GradientPair const> gpair, bool force_read_by_column = false) {
|
||||
int starting_index = std::numeric_limits<int>::max();
|
||||
int sync_count = 0;
|
||||
if (page_id == 0) {
|
||||
this->AddHistRows(&starting_index, &sync_count, nodes_for_explicit_hist_build,
|
||||
nodes_for_subtraction_trick, p_tree);
|
||||
}
|
||||
if (gidx.IsDense()) {
|
||||
this->BuildLocalHistograms<false>(page_id, space, gidx, nodes_for_explicit_hist_build,
|
||||
row_set_collection, gpair, force_read_by_column);
|
||||
} else {
|
||||
this->BuildLocalHistograms<true>(page_id, space, gidx, nodes_for_explicit_hist_build,
|
||||
row_set_collection, gpair, force_read_by_column);
|
||||
// We first check whether the cache size is already exceeded or about to be exceeded.
|
||||
// If not, then we can allocate histograms without clearing the cache and without
|
||||
// worrying about missing parent histogram.
|
||||
//
|
||||
// Otherwise, we need to rearrange the nodes before the allocation to make sure the
|
||||
// resulting buffer is contiguous. This is to facilitate efficient allreduce.
|
||||
|
||||
bool can_host = this->hist_.CanHost(nodes_to_build, nodes_to_sub);
|
||||
// True if the tree is still within the size of cache limit. Allocate histogram as
|
||||
// usual.
|
||||
auto cache_is_valid = can_host && !this->hist_.HasExceeded();
|
||||
|
||||
if (!can_host) {
|
||||
this->hist_.Clear(true);
|
||||
}
|
||||
|
||||
CHECK_GE(n_batches_, 1);
|
||||
if (page_id != n_batches_ - 1) {
|
||||
if (!rearrange || cache_is_valid) {
|
||||
// If not rearrange, we allocate the histogram as usual, assuming the nodes have
|
||||
// been properly arranged by other builders.
|
||||
this->hist_.AllocateHistograms(nodes_to_build, nodes_to_sub);
|
||||
if (rearrange) {
|
||||
CHECK(!this->hist_.HasExceeded());
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (is_distributed_ && !is_col_split_) {
|
||||
this->SyncHistogramDistributed(p_tree, nodes_for_explicit_hist_build,
|
||||
nodes_for_subtraction_trick,
|
||||
starting_index, sync_count);
|
||||
// The cache is full, parent histogram might be removed in previous iterations to
|
||||
// save memory.
|
||||
std::vector<bst_node_t> can_subtract;
|
||||
for (auto const &v : nodes_to_sub) {
|
||||
if (this->hist_.HistogramExists(p_tree->Parent(v))) {
|
||||
// We can still use the subtraction trick for this node
|
||||
can_subtract.push_back(v);
|
||||
} else {
|
||||
// This node requires a full build
|
||||
nodes_to_build.push_back(v);
|
||||
}
|
||||
}
|
||||
|
||||
nodes_to_sub = std::move(can_subtract);
|
||||
this->hist_.AllocateHistograms(nodes_to_build, nodes_to_sub);
|
||||
}
|
||||
|
||||
/** Main entry point of this class, build histogram for tree nodes. */
|
||||
void BuildHist(std::size_t page_idx, common::BlockedSpace2d const &space,
|
||||
GHistIndexMatrix const &gidx, common::RowSetCollection const &row_set_collection,
|
||||
std::vector<bst_node_t> const &nodes_to_build,
|
||||
linalg::VectorView<GradientPair const> gpair, bool force_read_by_column = false) {
|
||||
CHECK(gpair.Contiguous());
|
||||
|
||||
if (page_idx == 0) {
|
||||
// Add the local histogram cache to the parallel buffer before processing the first page.
|
||||
auto n_nodes = nodes_to_build.size();
|
||||
std::vector<common::GHistRow> target_hists(n_nodes);
|
||||
for (size_t i = 0; i < n_nodes; ++i) {
|
||||
auto const nidx = nodes_to_build[i];
|
||||
target_hists[i] = hist_[nidx];
|
||||
}
|
||||
buffer_.Reset(this->n_threads_, n_nodes, space, target_hists);
|
||||
}
|
||||
|
||||
if (gidx.IsDense()) {
|
||||
this->BuildLocalHistograms<false>(space, gidx, nodes_to_build, row_set_collection,
|
||||
gpair.Values(), force_read_by_column);
|
||||
} else {
|
||||
this->SyncHistogramLocal(p_tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick);
|
||||
this->BuildLocalHistograms<true>(space, gidx, nodes_to_build, row_set_collection,
|
||||
gpair.Values(), force_read_by_column);
|
||||
}
|
||||
}
|
||||
/** same as the other build hist but handles only single batch data (in-core) */
|
||||
void BuildHist(size_t page_id, GHistIndexMatrix const &gidx, RegTree *p_tree,
|
||||
common::RowSetCollection const &row_set_collection,
|
||||
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
|
||||
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
|
||||
common::Span<GradientPair const> gpair, bool force_read_by_column = false) {
|
||||
const size_t n_nodes = nodes_for_explicit_hist_build.size();
|
||||
// create space of size (# rows in each node)
|
||||
|
||||
void SyncHistogram(RegTree const *p_tree, std::vector<bst_node_t> const &nodes_to_build,
|
||||
std::vector<bst_node_t> const &nodes_to_trick) {
|
||||
auto n_total_bins = buffer_.TotalBins();
|
||||
common::BlockedSpace2d space(
|
||||
n_nodes,
|
||||
[&](size_t nidx_in_set) {
|
||||
const int32_t nidx = nodes_for_explicit_hist_build[nidx_in_set].nid;
|
||||
return row_set_collection[nidx].Size();
|
||||
},
|
||||
256);
|
||||
this->BuildHist(page_id, space, gidx, p_tree, row_set_collection, nodes_for_explicit_hist_build,
|
||||
nodes_for_subtraction_trick, gpair, force_read_by_column);
|
||||
}
|
||||
|
||||
void SyncHistogramDistributed(RegTree const *p_tree,
|
||||
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
|
||||
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
|
||||
int starting_index, int sync_count) {
|
||||
const size_t nbins = builder_.GetNumBins();
|
||||
common::BlockedSpace2d space(
|
||||
nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; }, 1024);
|
||||
common::ParallelFor2d(space, n_threads_, [&](size_t node, common::Range1d r) {
|
||||
const auto &entry = nodes_for_explicit_hist_build[node];
|
||||
auto this_hist = this->hist_[entry.nid];
|
||||
// Merging histograms from each thread into once
|
||||
buffer_.ReduceHist(node, r.begin(), r.end());
|
||||
// Store posible parent node
|
||||
auto this_local = hist_local_worker_[entry.nid];
|
||||
common::CopyHist(this_local, this_hist, r.begin(), r.end());
|
||||
|
||||
if (!p_tree->IsRoot(entry.nid)) {
|
||||
const size_t parent_id = p_tree->Parent(entry.nid);
|
||||
const int subtraction_node_id = nodes_for_subtraction_trick[node].nid;
|
||||
auto parent_hist = this->hist_local_worker_[parent_id];
|
||||
auto sibling_hist = this->hist_[subtraction_node_id];
|
||||
common::SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
|
||||
// Store posible parent node
|
||||
auto sibling_local = hist_local_worker_[subtraction_node_id];
|
||||
common::CopyHist(sibling_local, sibling_hist, r.begin(), r.end());
|
||||
}
|
||||
});
|
||||
|
||||
collective::Allreduce<collective::Operation::kSum>(
|
||||
reinterpret_cast<double *>(this->hist_[starting_index].data()),
|
||||
builder_.GetNumBins() * sync_count * 2);
|
||||
|
||||
ParallelSubtractionHist(space, nodes_for_explicit_hist_build, nodes_for_subtraction_trick,
|
||||
p_tree);
|
||||
|
||||
common::BlockedSpace2d space2(
|
||||
nodes_for_subtraction_trick.size(), [&](size_t) { return nbins; }, 1024);
|
||||
ParallelSubtractionHist(space2, nodes_for_subtraction_trick, nodes_for_explicit_hist_build,
|
||||
p_tree);
|
||||
}
|
||||
|
||||
void SyncHistogramLocal(RegTree const *p_tree,
|
||||
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
|
||||
std::vector<ExpandEntry> const &nodes_for_subtraction_trick) {
|
||||
const size_t nbins = this->builder_.GetNumBins();
|
||||
common::BlockedSpace2d space(
|
||||
nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; }, 1024);
|
||||
|
||||
nodes_to_build.size(), [&](std::size_t) { return n_total_bins; }, 1024);
|
||||
common::ParallelFor2d(space, this->n_threads_, [&](size_t node, common::Range1d r) {
|
||||
const auto &entry = nodes_for_explicit_hist_build[node];
|
||||
auto this_hist = this->hist_[entry.nid];
|
||||
// Merging histograms from each thread into once
|
||||
// Merging histograms from each thread.
|
||||
this->buffer_.ReduceHist(node, r.begin(), r.end());
|
||||
|
||||
if (!p_tree->IsRoot(entry.nid)) {
|
||||
auto const parent_id = p_tree->Parent(entry.nid);
|
||||
auto const subtraction_node_id = nodes_for_subtraction_trick[node].nid;
|
||||
auto parent_hist = this->hist_[parent_id];
|
||||
auto sibling_hist = this->hist_[subtraction_node_id];
|
||||
common::SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
|
||||
}
|
||||
});
|
||||
if (is_distributed_ && !is_col_split_) {
|
||||
// The cache is contiguous, we can perform allreduce for all nodes in one go.
|
||||
CHECK(!nodes_to_build.empty());
|
||||
auto first_nidx = nodes_to_build.front();
|
||||
std::size_t n = n_total_bins * nodes_to_build.size() * 2;
|
||||
collective::Allreduce<collective::Operation::kSum>(
|
||||
reinterpret_cast<double *>(this->hist_[first_nidx].data()), n);
|
||||
}
|
||||
|
||||
common::BlockedSpace2d const &subspace =
|
||||
nodes_to_trick.size() == nodes_to_build.size()
|
||||
? space
|
||||
: common::BlockedSpace2d{nodes_to_trick.size(),
|
||||
[&](std::size_t) { return n_total_bins; }, 1024};
|
||||
common::ParallelFor2d(
|
||||
subspace, this->n_threads_, [&](std::size_t nidx_in_set, common::Range1d r) {
|
||||
auto subtraction_nidx = nodes_to_trick[nidx_in_set];
|
||||
auto parent_id = p_tree->Parent(subtraction_nidx);
|
||||
auto sibling_nidx = p_tree->IsLeftChild(subtraction_nidx) ? p_tree->RightChild(parent_id)
|
||||
: p_tree->LeftChild(parent_id);
|
||||
auto sibling_hist = this->hist_[sibling_nidx];
|
||||
auto parent_hist = this->hist_[parent_id];
|
||||
auto subtract_hist = this->hist_[subtraction_nidx];
|
||||
common::SubtractionHist(subtract_hist, parent_hist, sibling_hist, r.begin(), r.end());
|
||||
});
|
||||
}
|
||||
|
||||
public:
|
||||
/* Getters for tests. */
|
||||
common::HistCollection const &Histogram() { return hist_; }
|
||||
auto& Buffer() { return buffer_; }
|
||||
|
||||
private:
|
||||
void
|
||||
ParallelSubtractionHist(const common::BlockedSpace2d &space,
|
||||
const std::vector<ExpandEntry> &nodes,
|
||||
const std::vector<ExpandEntry> &subtraction_nodes,
|
||||
const RegTree *p_tree) {
|
||||
common::ParallelFor2d(
|
||||
space, this->n_threads_, [&](size_t node, common::Range1d r) {
|
||||
const auto &entry = nodes[node];
|
||||
if (!(p_tree->IsLeftChild(entry.nid))) {
|
||||
auto this_hist = this->hist_[entry.nid];
|
||||
|
||||
if (!p_tree->IsRoot(entry.nid)) {
|
||||
const int subtraction_node_id = subtraction_nodes[node].nid;
|
||||
auto parent_hist = hist_[(*p_tree)[entry.nid].Parent()];
|
||||
auto sibling_hist = hist_[subtraction_node_id];
|
||||
common::SubtractionHist(this_hist, parent_hist, sibling_hist,
|
||||
r.begin(), r.end());
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Add a tree node to histogram buffer in local training environment.
|
||||
void AddHistRowsLocal(
|
||||
int *starting_index, int *sync_count,
|
||||
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
|
||||
std::vector<ExpandEntry> const &nodes_for_subtraction_trick) {
|
||||
for (auto const &entry : nodes_for_explicit_hist_build) {
|
||||
int nid = entry.nid;
|
||||
this->hist_.AddHistRow(nid);
|
||||
(*starting_index) = std::min(nid, (*starting_index));
|
||||
}
|
||||
(*sync_count) = nodes_for_explicit_hist_build.size();
|
||||
|
||||
for (auto const &node : nodes_for_subtraction_trick) {
|
||||
this->hist_.AddHistRow(node.nid);
|
||||
}
|
||||
this->hist_.AllocateAllData();
|
||||
}
|
||||
|
||||
void AddHistRowsDistributed(int *starting_index, int *sync_count,
|
||||
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
|
||||
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
|
||||
RegTree const *p_tree) {
|
||||
const size_t explicit_size = nodes_for_explicit_hist_build.size();
|
||||
const size_t subtaction_size = nodes_for_subtraction_trick.size();
|
||||
std::vector<int> merged_node_ids(explicit_size + subtaction_size);
|
||||
for (size_t i = 0; i < explicit_size; ++i) {
|
||||
merged_node_ids[i] = nodes_for_explicit_hist_build[i].nid;
|
||||
}
|
||||
for (size_t i = 0; i < subtaction_size; ++i) {
|
||||
merged_node_ids[explicit_size + i] = nodes_for_subtraction_trick[i].nid;
|
||||
}
|
||||
std::sort(merged_node_ids.begin(), merged_node_ids.end());
|
||||
int n_left = 0;
|
||||
for (auto const &nid : merged_node_ids) {
|
||||
if (p_tree->IsLeftChild(nid)) {
|
||||
this->hist_.AddHistRow(nid);
|
||||
(*starting_index) = std::min(nid, (*starting_index));
|
||||
n_left++;
|
||||
this->hist_local_worker_.AddHistRow(nid);
|
||||
}
|
||||
}
|
||||
for (auto const &nid : merged_node_ids) {
|
||||
if (!(p_tree->IsLeftChild(nid))) {
|
||||
this->hist_.AddHistRow(nid);
|
||||
this->hist_local_worker_.AddHistRow(nid);
|
||||
}
|
||||
}
|
||||
this->hist_.AllocateAllData();
|
||||
this->hist_local_worker_.AllocateAllData();
|
||||
(*sync_count) = std::max(1, n_left);
|
||||
}
|
||||
[[nodiscard]] BoundedHistCollection const &Histogram() const { return hist_; }
|
||||
[[nodiscard]] BoundedHistCollection &Histogram() { return hist_; }
|
||||
auto &Buffer() { return buffer_; }
|
||||
};
|
||||
|
||||
// Construct a work space for building histogram. Eventually we should move this
|
||||
// function into histogram builder once hist tree method supports external memory.
|
||||
template <typename Partitioner, typename ExpandEntry = CPUExpandEntry>
|
||||
template <typename Partitioner>
|
||||
common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
|
||||
std::vector<ExpandEntry> const &nodes_to_build) {
|
||||
std::vector<size_t> partition_size(nodes_to_build.size(), 0);
|
||||
std::vector<bst_node_t> const &nodes_to_build) {
|
||||
// FIXME(jiamingy): Handle different size of space. Right now we use the maximum
|
||||
// partition size for the buffer, which might not be efficient if partition sizes
|
||||
// have significant variance.
|
||||
std::vector<std::size_t> partition_size(nodes_to_build.size(), 0);
|
||||
for (auto const &partition : partitioners) {
|
||||
size_t k = 0;
|
||||
for (auto node : nodes_to_build) {
|
||||
auto n_rows_in_node = partition.Partitions()[node.nid].Size();
|
||||
for (auto nidx : nodes_to_build) {
|
||||
auto n_rows_in_node = partition.Partitions()[nidx].Size();
|
||||
partition_size[k] = std::max(partition_size[k], n_rows_in_node);
|
||||
k++;
|
||||
}
|
||||
@@ -322,6 +235,107 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
|
||||
nodes_to_build.size(), [&](size_t nidx_in_set) { return partition_size[nidx_in_set]; }, 256};
|
||||
return space;
|
||||
}
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
/**
|
||||
* @brief Histogram builder that can handle multiple targets.
|
||||
*/
|
||||
class MultiHistogramBuilder {
|
||||
std::vector<HistogramBuilder> target_builders_;
|
||||
Context const *ctx_;
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Build the histogram for root node.
|
||||
*/
|
||||
template <typename Partitioner, typename ExpandEntry>
|
||||
void BuildRootHist(DMatrix *p_fmat, RegTree const *p_tree,
|
||||
std::vector<Partitioner> const &partitioners,
|
||||
linalg::MatrixView<GradientPair const> gpair, ExpandEntry const &best,
|
||||
BatchParam const ¶m, bool force_read_by_column = false) {
|
||||
auto n_targets = p_tree->NumTargets();
|
||||
CHECK_EQ(gpair.Shape(1), n_targets);
|
||||
CHECK_EQ(p_fmat->Info().num_row_, gpair.Shape(0));
|
||||
CHECK_EQ(target_builders_.size(), n_targets);
|
||||
std::vector<bst_node_t> nodes{best.nid};
|
||||
std::vector<bst_node_t> dummy_sub;
|
||||
|
||||
auto space = ConstructHistSpace(partitioners, nodes);
|
||||
for (bst_target_t t{0}; t < n_targets; ++t) {
|
||||
this->target_builders_[t].AddHistRows(p_tree, &nodes, &dummy_sub, false);
|
||||
}
|
||||
CHECK(dummy_sub.empty());
|
||||
|
||||
std::size_t page_idx{0};
|
||||
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, param)) {
|
||||
for (bst_target_t t{0}; t < n_targets; ++t) {
|
||||
auto t_gpair = gpair.Slice(linalg::All(), t);
|
||||
this->target_builders_[t].BuildHist(page_idx, space, gidx,
|
||||
partitioners[page_idx].Partitions(), nodes, t_gpair,
|
||||
force_read_by_column);
|
||||
}
|
||||
++page_idx;
|
||||
}
|
||||
|
||||
for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
|
||||
this->target_builders_[t].SyncHistogram(p_tree, nodes, dummy_sub);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* @brief Build histogram for left and right child of valid candidates
|
||||
*/
|
||||
template <typename Partitioner, typename ExpandEntry>
|
||||
void BuildHistLeftRight(DMatrix *p_fmat, RegTree const *p_tree,
|
||||
std::vector<Partitioner> const &partitioners,
|
||||
std::vector<ExpandEntry> const &valid_candidates,
|
||||
linalg::MatrixView<GradientPair const> gpair, BatchParam const ¶m,
|
||||
bool force_read_by_column = false) {
|
||||
std::vector<bst_node_t> nodes_to_build(valid_candidates.size());
|
||||
std::vector<bst_node_t> nodes_to_sub(valid_candidates.size());
|
||||
AssignNodes(p_tree, valid_candidates, nodes_to_build, nodes_to_sub);
|
||||
|
||||
// use the first builder for getting number of valid nodes.
|
||||
target_builders_.front().AddHistRows(p_tree, &nodes_to_build, &nodes_to_sub, true);
|
||||
CHECK_GE(nodes_to_build.size(), nodes_to_sub.size());
|
||||
CHECK_EQ(nodes_to_sub.size() + nodes_to_build.size(), valid_candidates.size() * 2);
|
||||
|
||||
// allocate storage for the rest of the builders
|
||||
for (bst_target_t t = 1; t < target_builders_.size(); ++t) {
|
||||
target_builders_[t].AddHistRows(p_tree, &nodes_to_build, &nodes_to_sub, false);
|
||||
}
|
||||
|
||||
auto space = ConstructHistSpace(partitioners, nodes_to_build);
|
||||
std::size_t page_idx{0};
|
||||
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, param)) {
|
||||
CHECK_EQ(gpair.Shape(1), p_tree->NumTargets());
|
||||
for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
|
||||
auto t_gpair = gpair.Slice(linalg::All(), t);
|
||||
CHECK_EQ(t_gpair.Shape(0), p_fmat->Info().num_row_);
|
||||
this->target_builders_[t].BuildHist(page_idx, space, page,
|
||||
partitioners[page_idx].Partitions(), nodes_to_build,
|
||||
t_gpair, force_read_by_column);
|
||||
}
|
||||
page_idx++;
|
||||
}
|
||||
|
||||
for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
|
||||
this->target_builders_[t].SyncHistogram(p_tree, nodes_to_build, nodes_to_sub);
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]] auto const &Histogram(bst_target_t t) const {
|
||||
return target_builders_[t].Histogram();
|
||||
}
|
||||
[[nodiscard]] auto &Histogram(bst_target_t t) { return target_builders_[t].Histogram(); }
|
||||
|
||||
void Reset(Context const *ctx, bst_bin_t total_bins, bst_target_t n_targets, BatchParam const &p,
|
||||
bool is_distributed, bool is_col_split, HistMakerTrainParam const *param) {
|
||||
ctx_ = ctx;
|
||||
target_builders_.resize(n_targets);
|
||||
CHECK_GE(n_targets, 1);
|
||||
for (auto &v : target_builders_) {
|
||||
v.Reset(ctx, total_bins, p, is_distributed, is_col_split, param);
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace xgboost::tree
|
||||
#endif // XGBOOST_TREE_HIST_HISTOGRAM_H_
|
||||
|
||||
34
src/tree/hist/param.cc
Normal file
34
src/tree/hist/param.cc
Normal file
@@ -0,0 +1,34 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost Contributors
|
||||
*/
|
||||
#include "param.h"
|
||||
|
||||
#include <string> // for string
|
||||
|
||||
#include "../../collective/communicator-inl.h" // for GetRank, Broadcast
|
||||
#include "xgboost/json.h" // for Object, Json
|
||||
#include "xgboost/tree_model.h" // for RegTree
|
||||
|
||||
namespace xgboost::tree {
|
||||
DMLC_REGISTER_PARAMETER(HistMakerTrainParam);
|
||||
|
||||
/**
 * @brief Debug check that the tree held by this worker equals the one held by rank 0.
 *
 * Does nothing unless `debug_synchronize` is set. Otherwise rank 0 saves its tree into a
 * JSON object, the dumped buffer is broadcast with root 0, and every worker loads it back
 * and compares it with its own tree. The CHECK fails on a mismatch.
 *
 * @param local_tree Tree constructed by the calling worker.
 */
void HistMakerTrainParam::CheckTreesSynchronized(RegTree const* local_tree) const {
  if (this->debug_synchronize) {
    // Only rank 0 fills the JSON document; other workers dump an empty object that is
    // overwritten by the broadcast below.
    Json model{Object{}};
    if (collective::GetRank() == 0) {
      local_tree->SaveModel(&model);
    }

    std::string s_model;
    Json::Dump(model, &s_model, std::ios::binary);
    collective::Broadcast(&s_model, 0);

    auto j_ref_tree = Json::Load(StringView{s_model}, std::ios::binary);
    RegTree ref_tree{};  // rank 0 tree
    ref_tree.LoadModel(j_ref_tree);
    CHECK(*local_tree == ref_tree);
  }
}
|
||||
} // namespace xgboost::tree
|
||||
31
src/tree/hist/param.h
Normal file
31
src/tree/hist/param.h
Normal file
@@ -0,0 +1,31 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
|
||||
#include "xgboost/parameter.h" // for XGBoostParameter
|
||||
#include "xgboost/tree_model.h" // for RegTree
|
||||
|
||||
namespace xgboost::tree {
|
||||
// Parameters for the histogram-based tree makers: a debug switch for checking that
// distributed workers built identical trees, and the size limit of the CPU histogram cache.
struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
|
||||
// Default for `max_cached_hist_node`: 1 << 16 (65536) nodes.
constexpr static std::size_t DefaultNodes() { return static_cast<std::size_t>(1) << 16; }
|
||||
|
||||
// When true, CheckTreesSynchronized compares the local tree against rank 0's tree.
bool debug_synchronize{false};
|
||||
// Maximum number of nodes kept in the CPU histogram cache.
std::size_t max_cached_hist_node{DefaultNodes()};
|
||||
|
||||
// Returns immediately unless `debug_synchronize` is set; defined in param.cc.
void CheckTreesSynchronized(RegTree const* local_tree) const;
|
||||
|
||||
// declare parameters
|
||||
// Each field is registered with the same default as its in-class initializer above.
DMLC_DECLARE_PARAMETER(HistMakerTrainParam) {
|
||||
DMLC_DECLARE_FIELD(debug_synchronize)
|
||||
.set_default(false)
|
||||
.describe("Check if all distributed tree are identical after tree construction.");
|
||||
DMLC_DECLARE_FIELD(max_cached_hist_node)
|
||||
.set_default(DefaultNodes())
|
||||
|
||||
// A lower bound of 1 is declared for the cache size.
.set_lower_bound(1)
|
||||
.describe("Maximum number of nodes in CPU histogram cache. Only for internal usage.");
|
||||
}
|
||||
};
|
||||
} // namespace xgboost::tree
|
||||
@@ -526,7 +526,7 @@ struct SplitEntryContainer {
|
||||
* \return whether the proposed split is better and can replace current split
|
||||
*/
|
||||
template <typename GradientSumT>
|
||||
bool Update(bst_float new_loss_chg, unsigned split_index, bst_float new_split_value,
|
||||
bool Update(bst_float new_loss_chg, bst_feature_t split_index, float new_split_value,
|
||||
bool default_left, bool is_cat, GradientSumT const &left_sum,
|
||||
GradientSumT const &right_sum) {
|
||||
if (this->NeedReplace(new_loss_chg, split_index)) {
|
||||
|
||||
@@ -213,7 +213,7 @@ std::vector<bst_cat_t> GetSplitCategories(RegTree const &tree, int32_t nidx) {
|
||||
auto split = common::KCatBitField{csr.categories.subspan(seg.beg, seg.size)};
|
||||
|
||||
std::vector<bst_cat_t> cats;
|
||||
for (size_t i = 0; i < split.Size(); ++i) {
|
||||
for (size_t i = 0; i < split.Capacity(); ++i) {
|
||||
if (split.Check(i)) {
|
||||
cats.push_back(static_cast<bst_cat_t>(i));
|
||||
}
|
||||
@@ -398,11 +398,14 @@ class JsonGenerator : public TreeGenerator {
|
||||
static std::string const kIndicatorTemplate =
|
||||
R"ID( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", "yes": {yes}, "no": {no})ID";
|
||||
auto split_index = tree[nid].SplitIndex();
|
||||
auto fname = fmap_.Name(split_index);
|
||||
std::string qfname; // quoted
|
||||
common::EscapeU8(fname, &qfname);
|
||||
auto result = SuperT::Match(
|
||||
kIndicatorTemplate,
|
||||
{{"{nid}", std::to_string(nid)},
|
||||
{"{depth}", std::to_string(depth)},
|
||||
{"{fname}", fmap_.Name(split_index)},
|
||||
{"{fname}", qfname},
|
||||
{"{yes}", std::to_string(nyes)},
|
||||
{"{no}", std::to_string(tree[nid].DefaultChild())}});
|
||||
return result;
|
||||
@@ -430,12 +433,14 @@ class JsonGenerator : public TreeGenerator {
|
||||
std::string const &template_str, std::string cond,
|
||||
uint32_t depth) const {
|
||||
auto split_index = tree[nid].SplitIndex();
|
||||
auto fname = split_index < fmap_.Size() ? fmap_.Name(split_index) : std::to_string(split_index);
|
||||
std::string qfname; // quoted
|
||||
common::EscapeU8(fname, &qfname);
|
||||
std::string const result = SuperT::Match(
|
||||
template_str,
|
||||
{{"{nid}", std::to_string(nid)},
|
||||
{"{depth}", std::to_string(depth)},
|
||||
{"{fname}", split_index < fmap_.Size() ? fmap_.Name(split_index) :
|
||||
std::to_string(split_index)},
|
||||
{"{fname}", qfname},
|
||||
{"{cond}", cond},
|
||||
{"{left}", std::to_string(tree[nid].LeftChild())},
|
||||
{"{right}", std::to_string(tree[nid].RightChild())},
|
||||
@@ -1004,7 +1009,7 @@ void RegTree::SaveCategoricalSplit(Json* p_out) const {
|
||||
auto segment = split_categories_segments_[i];
|
||||
auto node_categories = this->GetSplitCategories().subspan(segment.beg, segment.size);
|
||||
common::KCatBitField const cat_bits(node_categories);
|
||||
for (size_t i = 0; i < cat_bits.Size(); ++i) {
|
||||
for (size_t i = 0; i < cat_bits.Capacity(); ++i) {
|
||||
if (cat_bits.Check(i)) {
|
||||
categories.GetArray().emplace_back(i);
|
||||
}
|
||||
|
||||
@@ -3,27 +3,39 @@
|
||||
*
|
||||
* \brief Implementation for the approx tree method.
|
||||
*/
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <algorithm> // for max, transform, fill_n
|
||||
#include <cstddef> // for size_t
|
||||
#include <map> // for map
|
||||
#include <memory> // for allocator, unique_ptr, make_shared, make_unique
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../collective/aggregator.h"
|
||||
#include "../common/random.h"
|
||||
#include "../data/gradient_index.h"
|
||||
#include "common_row_partitioner.h"
|
||||
#include "constraints.h"
|
||||
#include "driver.h"
|
||||
#include "hist/evaluate_splits.h"
|
||||
#include "hist/histogram.h"
|
||||
#include "hist/sampler.h" // for SampleGradient
|
||||
#include "param.h"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/json.h"
|
||||
#include "xgboost/linalg.h"
|
||||
#include "xgboost/task.h" // for ObjInfo
|
||||
#include "xgboost/tree_model.h"
|
||||
#include "xgboost/tree_updater.h" // for TreeUpdater
|
||||
#include "../collective/aggregator.h" // for GlobalSum
|
||||
#include "../collective/communicator-inl.h" // for IsDistributed
|
||||
#include "../common/hist_util.h" // for HistogramCuts
|
||||
#include "../common/random.h" // for ColumnSampler
|
||||
#include "../common/timer.h" // for Monitor
|
||||
#include "../data/gradient_index.h" // for GHistIndexMatrix
|
||||
#include "common_row_partitioner.h" // for CommonRowPartitioner
|
||||
#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG
|
||||
#include "driver.h" // for Driver
|
||||
#include "hist/evaluate_splits.h" // for HistEvaluator, UpdatePredictionCacheImpl
|
||||
#include "hist/expand_entry.h" // for CPUExpandEntry
|
||||
#include "hist/histogram.h" // for MultiHistogramBuilder
|
||||
#include "hist/param.h" // for HistMakerTrainParam
|
||||
#include "hist/sampler.h" // for SampleGradient
|
||||
#include "param.h" // for GradStats, TrainParam
|
||||
#include "xgboost/base.h" // for Args, GradientPair, bst_node_t, bst_bin_t
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/data.h" // for DMatrix, BatchSet, BatchIterator, MetaInfo
|
||||
#include "xgboost/host_device_vector.h" // for HostDeviceVector
|
||||
#include "xgboost/json.h" // for Object, Json, FromJson, ToJson, get
|
||||
#include "xgboost/linalg.h" // for Matrix, MakeTensorView, Empty, MatrixView
|
||||
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK
|
||||
#include "xgboost/span.h" // for Span
|
||||
#include "xgboost/task.h" // for ObjInfo
|
||||
#include "xgboost/tree_model.h" // for RegTree, RTreeNodeStat
|
||||
#include "xgboost/tree_updater.h" // for TreeUpdater, TreeUpdaterReg, XGBOOST_REGISTE...
|
||||
|
||||
namespace xgboost::tree {
|
||||
|
||||
@@ -43,9 +55,10 @@ auto BatchSpec(TrainParam const &p, common::Span<float> hess) {
|
||||
class GloablApproxBuilder {
|
||||
protected:
|
||||
TrainParam const *param_;
|
||||
HistMakerTrainParam const *hist_param_{nullptr};
|
||||
std::shared_ptr<common::ColumnSampler> col_sampler_;
|
||||
HistEvaluator evaluator_;
|
||||
HistogramBuilder<CPUExpandEntry> histogram_builder_;
|
||||
MultiHistogramBuilder histogram_builder_;
|
||||
Context const *ctx_;
|
||||
ObjInfo const *const task_;
|
||||
|
||||
@@ -58,7 +71,7 @@ class GloablApproxBuilder {
|
||||
common::HistogramCuts feature_values_;
|
||||
|
||||
public:
|
||||
void InitData(DMatrix *p_fmat, common::Span<float> hess) {
|
||||
void InitData(DMatrix *p_fmat, RegTree const *p_tree, common::Span<float> hess) {
|
||||
monitor_->Start(__func__);
|
||||
|
||||
n_batches_ = 0;
|
||||
@@ -78,8 +91,9 @@ class GloablApproxBuilder {
|
||||
n_batches_++;
|
||||
}
|
||||
|
||||
histogram_builder_.Reset(n_total_bins, BatchSpec(*param_, hess), ctx_->Threads(), n_batches_,
|
||||
collective::IsDistributed(), p_fmat->Info().IsColumnSplit());
|
||||
histogram_builder_.Reset(ctx_, n_total_bins, p_tree->NumTargets(), BatchSpec(*param_, hess),
|
||||
collective::IsDistributed(), p_fmat->Info().IsColumnSplit(),
|
||||
hist_param_);
|
||||
monitor_->Stop(__func__);
|
||||
}
|
||||
|
||||
@@ -95,20 +109,16 @@ class GloablApproxBuilder {
|
||||
}
|
||||
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&root_sum), 2);
|
||||
std::vector<CPUExpandEntry> nodes{best};
|
||||
size_t i = 0;
|
||||
auto space = ConstructHistSpace(partitioner_, nodes);
|
||||
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
|
||||
histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), nodes,
|
||||
{}, gpair);
|
||||
i++;
|
||||
}
|
||||
this->histogram_builder_.BuildRootHist(p_fmat, p_tree, partitioner_,
|
||||
linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1),
|
||||
best, BatchSpec(*param_, hess));
|
||||
|
||||
auto weight = evaluator_.InitRoot(root_sum);
|
||||
p_tree->Stat(RegTree::kRoot).sum_hess = root_sum.GetHess();
|
||||
p_tree->Stat(RegTree::kRoot).base_weight = weight;
|
||||
(*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
|
||||
|
||||
auto const &histograms = histogram_builder_.Histogram();
|
||||
auto const &histograms = histogram_builder_.Histogram(0);
|
||||
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
|
||||
evaluator_.EvaluateSplits(histograms, feature_values_, ft, *p_tree, &nodes);
|
||||
monitor_->Stop(__func__);
|
||||
@@ -129,30 +139,9 @@ class GloablApproxBuilder {
|
||||
std::vector<CPUExpandEntry> const &valid_candidates,
|
||||
std::vector<GradientPair> const &gpair, common::Span<float> hess) {
|
||||
monitor_->Start(__func__);
|
||||
std::vector<CPUExpandEntry> nodes_to_build;
|
||||
std::vector<CPUExpandEntry> nodes_to_sub;
|
||||
|
||||
for (auto const &c : valid_candidates) {
|
||||
auto left_nidx = (*p_tree)[c.nid].LeftChild();
|
||||
auto right_nidx = (*p_tree)[c.nid].RightChild();
|
||||
auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
|
||||
|
||||
auto build_nidx = left_nidx;
|
||||
auto subtract_nidx = right_nidx;
|
||||
if (fewer_right) {
|
||||
std::swap(build_nidx, subtract_nidx);
|
||||
}
|
||||
nodes_to_build.push_back(CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}});
|
||||
nodes_to_sub.push_back(CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}});
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
|
||||
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
|
||||
histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
|
||||
nodes_to_build, nodes_to_sub, gpair);
|
||||
i++;
|
||||
}
|
||||
this->histogram_builder_.BuildHistLeftRight(
|
||||
p_fmat, p_tree, partitioner_, valid_candidates,
|
||||
linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), BatchSpec(*param_, hess));
|
||||
monitor_->Stop(__func__);
|
||||
}
|
||||
|
||||
@@ -169,10 +158,12 @@ class GloablApproxBuilder {
|
||||
}
|
||||
|
||||
public:
|
||||
explicit GloablApproxBuilder(TrainParam const *param, MetaInfo const &info, Context const *ctx,
|
||||
explicit GloablApproxBuilder(TrainParam const *param, HistMakerTrainParam const *hist_param,
|
||||
MetaInfo const &info, Context const *ctx,
|
||||
std::shared_ptr<common::ColumnSampler> column_sampler,
|
||||
ObjInfo const *task, common::Monitor *monitor)
|
||||
: param_{param},
|
||||
hist_param_{hist_param},
|
||||
col_sampler_{std::move(column_sampler)},
|
||||
evaluator_{ctx, param_, info, col_sampler_},
|
||||
ctx_{ctx},
|
||||
@@ -182,7 +173,7 @@ class GloablApproxBuilder {
|
||||
void UpdateTree(DMatrix *p_fmat, std::vector<GradientPair> const &gpair, common::Span<float> hess,
|
||||
RegTree *p_tree, HostDeviceVector<bst_node_t> *p_out_position) {
|
||||
p_last_tree_ = p_tree;
|
||||
this->InitData(p_fmat, hess);
|
||||
this->InitData(p_fmat, p_tree, hess);
|
||||
|
||||
Driver<CPUExpandEntry> driver(*param_);
|
||||
auto &tree = *p_tree;
|
||||
@@ -232,7 +223,7 @@ class GloablApproxBuilder {
|
||||
best_splits.push_back(l_best);
|
||||
best_splits.push_back(r_best);
|
||||
}
|
||||
auto const &histograms = histogram_builder_.Histogram();
|
||||
auto const &histograms = histogram_builder_.Histogram(0);
|
||||
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
|
||||
monitor_->Start("EvaluateSplits");
|
||||
evaluator_.EvaluateSplits(histograms, feature_values_, ft, *p_tree, &best_splits);
|
||||
@@ -260,6 +251,7 @@ class GlobalApproxUpdater : public TreeUpdater {
|
||||
std::shared_ptr<common::ColumnSampler> column_sampler_ =
|
||||
std::make_shared<common::ColumnSampler>();
|
||||
ObjInfo const *task_;
|
||||
HistMakerTrainParam hist_param_;
|
||||
|
||||
public:
|
||||
explicit GlobalApproxUpdater(Context const *ctx, ObjInfo const *task)
|
||||
@@ -267,25 +259,33 @@ class GlobalApproxUpdater : public TreeUpdater {
|
||||
monitor_.Init(__func__);
|
||||
}
|
||||
|
||||
void Configure(Args const &) override {}
|
||||
void LoadConfig(Json const &) override {}
|
||||
void SaveConfig(Json *) const override {}
|
||||
void Configure(Args const &args) override { hist_param_.UpdateAllowUnknown(args); }
|
||||
void LoadConfig(Json const &in) override {
|
||||
auto const &config = get<Object const>(in);
|
||||
FromJson(config.at("hist_train_param"), &hist_param_);
|
||||
}
|
||||
void SaveConfig(Json *p_out) const override {
|
||||
auto &out = *p_out;
|
||||
out["hist_train_param"] = ToJson(hist_param_);
|
||||
}
|
||||
|
||||
void InitData(TrainParam const ¶m, HostDeviceVector<GradientPair> const *gpair,
|
||||
void InitData(TrainParam const ¶m, linalg::Matrix<GradientPair> const *gpair,
|
||||
linalg::Matrix<GradientPair> *sampled) {
|
||||
*sampled = linalg::Empty<GradientPair>(ctx_, gpair->Size(), 1);
|
||||
sampled->Data()->Copy(*gpair);
|
||||
auto in = gpair->HostView().Values();
|
||||
std::copy(in.data(), in.data() + in.size(), sampled->HostView().Values().data());
|
||||
|
||||
SampleGradient(ctx_, param, sampled->HostView());
|
||||
}
|
||||
|
||||
[[nodiscard]] char const *Name() const override { return "grow_histmaker"; }
|
||||
|
||||
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *m,
|
||||
void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *m,
|
||||
common::Span<HostDeviceVector<bst_node_t>> out_position,
|
||||
const std::vector<RegTree *> &trees) override {
|
||||
pimpl_ = std::make_unique<GloablApproxBuilder>(param, m->Info(), ctx_, column_sampler_, task_,
|
||||
&monitor_);
|
||||
CHECK(hist_param_.GetInitialised());
|
||||
pimpl_ = std::make_unique<GloablApproxBuilder>(param, &hist_param_, m->Info(), ctx_,
|
||||
column_sampler_, task_, &monitor_);
|
||||
|
||||
linalg::Matrix<GradientPair> h_gpair;
|
||||
// Obtain the hessian values for weighted sketching
|
||||
@@ -300,6 +300,7 @@ class GlobalApproxUpdater : public TreeUpdater {
|
||||
std::size_t t_idx = 0;
|
||||
for (auto p_tree : trees) {
|
||||
this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]);
|
||||
hist_param_.CheckTreesSynchronized(p_tree);
|
||||
++t_idx;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,7 +91,7 @@ class ColMaker: public TreeUpdater {
|
||||
}
|
||||
}
|
||||
|
||||
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
|
||||
void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *dmat,
|
||||
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
|
||||
const std::vector<RegTree *> &trees) override {
|
||||
if (collective::IsDistributed()) {
|
||||
@@ -106,10 +106,11 @@ class ColMaker: public TreeUpdater {
|
||||
// rescale learning rate according to size of trees
|
||||
interaction_constraints_.Configure(*param, dmat->Info().num_row_);
|
||||
// build tree
|
||||
CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented();
|
||||
for (auto tree : trees) {
|
||||
CHECK(ctx_);
|
||||
Builder builder(*param, colmaker_param_, interaction_constraints_, ctx_, column_densities_);
|
||||
builder.Update(gpair->ConstHostVector(), dmat, tree);
|
||||
builder.Update(gpair->Data()->ConstHostVector(), dmat, tree);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -72,7 +72,6 @@ struct DeviceSplitCandidate {
|
||||
// split.
|
||||
bst_cat_t thresh{-1};
|
||||
|
||||
common::CatBitField split_cats;
|
||||
bool is_cat { false };
|
||||
|
||||
GradientPairInt64 left_sum;
|
||||
@@ -80,12 +79,6 @@ struct DeviceSplitCandidate {
|
||||
|
||||
XGBOOST_DEVICE DeviceSplitCandidate() {} // NOLINT
|
||||
|
||||
template <typename T>
|
||||
XGBOOST_DEVICE void SetCat(T c) {
|
||||
this->split_cats.Set(common::AsCat(c));
|
||||
fvalue = std::max(this->fvalue, static_cast<float>(c));
|
||||
}
|
||||
|
||||
XGBOOST_DEVICE void Update(float loss_chg_in, DefaultDirection dir_in, float fvalue_in,
|
||||
int findex_in, GradientPairInt64 left_sum_in,
|
||||
GradientPairInt64 right_sum_in, bool cat,
|
||||
@@ -108,22 +101,23 @@ struct DeviceSplitCandidate {
|
||||
*/
|
||||
XGBOOST_DEVICE void UpdateCat(float loss_chg_in, DefaultDirection dir_in, bst_cat_t thresh_in,
|
||||
bst_feature_t findex_in, GradientPairInt64 left_sum_in,
|
||||
GradientPairInt64 right_sum_in, GPUTrainingParam const& param, const GradientQuantiser& quantiser) {
|
||||
if (loss_chg_in > loss_chg &&
|
||||
quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight &&
|
||||
quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) {
|
||||
loss_chg = loss_chg_in;
|
||||
dir = dir_in;
|
||||
fvalue = std::numeric_limits<float>::quiet_NaN();
|
||||
thresh = thresh_in;
|
||||
is_cat = true;
|
||||
left_sum = left_sum_in;
|
||||
right_sum = right_sum_in;
|
||||
findex = findex_in;
|
||||
}
|
||||
GradientPairInt64 right_sum_in, GPUTrainingParam const& param,
|
||||
const GradientQuantiser& quantiser) {
|
||||
if (loss_chg_in > loss_chg &&
|
||||
quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight &&
|
||||
quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) {
|
||||
loss_chg = loss_chg_in;
|
||||
dir = dir_in;
|
||||
fvalue = std::numeric_limits<float>::quiet_NaN();
|
||||
thresh = thresh_in;
|
||||
is_cat = true;
|
||||
left_sum = left_sum_in;
|
||||
right_sum = right_sum_in;
|
||||
findex = findex_in;
|
||||
}
|
||||
}
|
||||
|
||||
XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
|
||||
[[nodiscard]] XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, DeviceSplitCandidate const& c) {
|
||||
os << "loss_chg:" << c.loss_chg << ", "
|
||||
|
||||
@@ -7,12 +7,13 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <cstddef> // for size_t
|
||||
#include <memory> // for unique_ptr, make_unique
|
||||
#include <utility> // for move
|
||||
#include <vector>
|
||||
|
||||
#include "../collective/communicator-inl.cuh"
|
||||
#include "../collective/aggregator.h"
|
||||
#include "../collective/aggregator.cuh"
|
||||
#include "../common/bitfield.h"
|
||||
#include "../common/categorical.h"
|
||||
|
||||
@@ -22,6 +23,7 @@
|
||||
#include "../common/io.h"
|
||||
#include "../common/timer.h"
|
||||
#include "../data/ellpack_page.cuh"
|
||||
#include "../data/ellpack_page.h"
|
||||
#include "constraints.cuh"
|
||||
#include "driver.h"
|
||||
#include "gpu_hist/evaluate_splits.cuh"
|
||||
@@ -30,8 +32,8 @@
|
||||
#include "gpu_hist/gradient_based_sampler.cuh"
|
||||
#include "gpu_hist/histogram.cuh"
|
||||
#include "gpu_hist/row_partitioner.cuh"
|
||||
#include "hist/param.h"
|
||||
#include "param.h"
|
||||
#include "split_evaluator.h"
|
||||
#include "updater_gpu_common.cuh"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/context.h"
|
||||
@@ -48,20 +50,6 @@ namespace xgboost::tree {
|
||||
DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
|
||||
#endif // !defined(GTEST_TEST)
|
||||
|
||||
// training parameters specific to this algorithm
|
||||
struct GPUHistMakerTrainParam
|
||||
: public XGBoostParameter<GPUHistMakerTrainParam> {
|
||||
bool debug_synchronize;
|
||||
// declare parameters
|
||||
DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) {
|
||||
DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe(
|
||||
"Check if all distributed tree are identical after tree construction.");
|
||||
}
|
||||
};
|
||||
#if !defined(GTEST_TEST)
|
||||
DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam);
|
||||
#endif // !defined(GTEST_TEST)
|
||||
|
||||
/**
|
||||
* \struct DeviceHistogramStorage
|
||||
*
|
||||
@@ -170,16 +158,16 @@ class DeviceHistogramStorage {
|
||||
};
|
||||
|
||||
// Manage memory for a single GPU
|
||||
template <typename GradientSumT>
|
||||
struct GPUHistMakerDevice {
|
||||
private:
|
||||
GPUHistEvaluator evaluator_;
|
||||
Context const* ctx_;
|
||||
std::shared_ptr<common::ColumnSampler> column_sampler_;
|
||||
MetaInfo const& info_;
|
||||
|
||||
public:
|
||||
EllpackPageImpl const* page;
|
||||
EllpackPageImpl const* page{nullptr};
|
||||
common::Span<FeatureType const> feature_types;
|
||||
BatchParam batch_param;
|
||||
|
||||
std::unique_ptr<RowPartitioner> row_partitioner;
|
||||
DeviceHistogramStorage<> hist{};
|
||||
@@ -199,98 +187,95 @@ struct GPUHistMakerDevice {
|
||||
dh::PinnedMemory pinned2;
|
||||
|
||||
common::Monitor monitor;
|
||||
common::ColumnSampler column_sampler;
|
||||
FeatureInteractionConstraintDevice interaction_constraints;
|
||||
|
||||
std::unique_ptr<GradientBasedSampler> sampler;
|
||||
|
||||
std::unique_ptr<FeatureGroups> feature_groups;
|
||||
|
||||
|
||||
GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
|
||||
common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
|
||||
TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
|
||||
BatchParam _batch_param)
|
||||
GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
|
||||
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
|
||||
TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
|
||||
uint32_t n_features, BatchParam batch_param, MetaInfo const& info)
|
||||
: evaluator_{_param, n_features, ctx->gpu_id},
|
||||
ctx_(ctx),
|
||||
page(_page),
|
||||
feature_types{_feature_types},
|
||||
param(std::move(_param)),
|
||||
column_sampler(column_sampler_seed),
|
||||
column_sampler_(std::move(column_sampler)),
|
||||
interaction_constraints(param, n_features),
|
||||
batch_param(std::move(_batch_param)) {
|
||||
sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample,
|
||||
param.sampling_method));
|
||||
info_{info} {
|
||||
sampler = std::make_unique<GradientBasedSampler>(ctx, _n_rows, batch_param, param.subsample,
|
||||
param.sampling_method, is_external_memory);
|
||||
if (!param.monotone_constraints.empty()) {
|
||||
// Copy assigning an empty vector causes an exception in MSVC debug builds
|
||||
monotone_constraints = param.monotone_constraints;
|
||||
}
|
||||
|
||||
// Init histogram
|
||||
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
|
||||
CHECK(column_sampler_);
|
||||
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
|
||||
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
|
||||
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
|
||||
sizeof(GradientSumT)));
|
||||
}
|
||||
|
||||
~GPUHistMakerDevice() { // NOLINT
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
|
||||
#endif
|
||||
~GPUHistMakerDevice() = default;
|
||||
|
||||
void InitFeatureGroupsOnce() {
|
||||
if (!feature_groups) {
|
||||
CHECK(page);
|
||||
feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
|
||||
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
|
||||
sizeof(GradientPairPrecise));
|
||||
}
|
||||
}
|
||||
|
||||
// Reset values for each update iteration
|
||||
// Note that the column sampler must be passed by value because it is not
|
||||
// thread safe
|
||||
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
|
||||
auto const& info = dmat->Info();
|
||||
this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
|
||||
param.colsample_bynode, param.colsample_bylevel,
|
||||
param.colsample_bytree);
|
||||
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
|
||||
param.colsample_bynode, param.colsample_bylevel,
|
||||
param.colsample_bytree);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
|
||||
#endif
|
||||
|
||||
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
|
||||
ctx_->gpu_id);
|
||||
|
||||
this->interaction_constraints.Reset();
|
||||
|
||||
if (d_gpair.size() != dh_gpair->Size()) {
|
||||
d_gpair.resize(dh_gpair->Size());
|
||||
}
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
|
||||
dh_gpair->Size() * sizeof(GradientPair), cudaMemcpyDeviceToDevice));
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
|
||||
dh_gpair->Size() * sizeof(GradientPair),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(
|
||||
d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
|
||||
dh_gpair->Size() * sizeof(GradientPair), hipMemcpyDeviceToDevice));
|
||||
dh::safe_cuda(hipMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
|
||||
dh_gpair->Size() * sizeof(GradientPair),
|
||||
hipMemcpyDeviceToDevice));
|
||||
#endif
|
||||
|
||||
auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
|
||||
page = sample.page;
|
||||
gpair = sample.gpair;
|
||||
|
||||
quantiser.reset(new GradientQuantiser(this->gpair));
|
||||
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
|
||||
dmat->Info().IsColumnSplit(), ctx_->gpu_id);
|
||||
|
||||
quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());
|
||||
|
||||
row_partitioner.reset(); // Release the device memory first before reallocating
|
||||
row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
|
||||
row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);
|
||||
|
||||
// Init histogram
|
||||
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
|
||||
hist.Reset();
|
||||
|
||||
this->InitFeatureGroupsOnce();
|
||||
}
|
||||
|
||||
GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) {
|
||||
int nidx = RegTree::kRoot;
|
||||
GPUTrainingParam gpu_param(param);
|
||||
auto sampled_features = column_sampler.GetFeatureSet(0);
|
||||
sampled_features->SetDevice(ctx_->gpu_id);
|
||||
auto sampled_features = column_sampler_->GetFeatureSet(0);
|
||||
sampled_features->SetDevice(ctx_->Device());
|
||||
common::Span<bst_feature_t> feature_set =
|
||||
interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
|
||||
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
||||
@@ -324,19 +309,19 @@ struct GPUHistMakerDevice {
|
||||
dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
|
||||
// Store the feature set ptrs so they dont go out of scope before the kernel is called
|
||||
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
|
||||
for (size_t i = 0; i < candidates.size(); i++) {
|
||||
for (std::size_t i = 0; i < candidates.size(); i++) {
|
||||
auto candidate = candidates.at(i);
|
||||
int left_nidx = tree[candidate.nid].LeftChild();
|
||||
int right_nidx = tree[candidate.nid].RightChild();
|
||||
nidx[i * 2] = left_nidx;
|
||||
nidx[i * 2 + 1] = right_nidx;
|
||||
auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
|
||||
left_sampled_features->SetDevice(ctx_->gpu_id);
|
||||
auto left_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(left_nidx));
|
||||
left_sampled_features->SetDevice(ctx_->Device());
|
||||
feature_sets.emplace_back(left_sampled_features);
|
||||
common::Span<bst_feature_t> left_feature_set =
|
||||
interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
|
||||
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
|
||||
right_sampled_features->SetDevice(ctx_->gpu_id);
|
||||
auto right_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(right_nidx));
|
||||
right_sampled_features->SetDevice(ctx_->Device());
|
||||
feature_sets.emplace_back(right_sampled_features);
|
||||
common::Span<bst_feature_t> right_feature_set =
|
||||
interaction_constraints.Query(right_sampled_features->DeviceSpan(),
|
||||
@@ -363,10 +348,8 @@ struct GPUHistMakerDevice {
|
||||
h_node_inputs.size() * sizeof(EvaluateSplitInputs), hipMemcpyDefault));
|
||||
#endif
|
||||
|
||||
this->evaluator_.EvaluateSplits(nidx, max_active_features,
|
||||
dh::ToSpan(d_node_inputs), shared_inputs,
|
||||
dh::ToSpan(entries));
|
||||
|
||||
this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
|
||||
shared_inputs, dh::ToSpan(entries));
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
|
||||
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
|
||||
@@ -378,7 +361,7 @@ struct GPUHistMakerDevice {
|
||||
#endif
|
||||
|
||||
dh::DefaultStream().Sync();
|
||||
}
|
||||
}
|
||||
|
||||
void BuildHist(int nidx) {
|
||||
auto d_node_hist = hist.GetNodeHistogram(nidx);
|
||||
@@ -410,31 +393,108 @@ struct GPUHistMakerDevice {
|
||||
struct NodeSplitData {
|
||||
RegTree::Node split_node;
|
||||
FeatureType split_type;
|
||||
common::CatBitField node_cats;
|
||||
common::KCatBitField node_cats;
|
||||
};
|
||||
|
||||
void UpdatePosition(const std::vector<GPUExpandEntry>& candidates, RegTree* p_tree) {
|
||||
if (candidates.empty()) return;
|
||||
std::vector<int> nidx(candidates.size());
|
||||
std::vector<int> left_nidx(candidates.size());
|
||||
std::vector<int> right_nidx(candidates.size());
|
||||
void UpdatePositionColumnSplit(EllpackDeviceAccessor d_matrix,
|
||||
std::vector<NodeSplitData> const& split_data,
|
||||
std::vector<bst_node_t> const& nidx,
|
||||
std::vector<bst_node_t> const& left_nidx,
|
||||
std::vector<bst_node_t> const& right_nidx) {
|
||||
auto const num_candidates = split_data.size();
|
||||
|
||||
using BitVector = LBitField64;
|
||||
using BitType = BitVector::value_type;
|
||||
auto const size = BitVector::ComputeStorageSize(d_matrix.n_rows * num_candidates);
|
||||
dh::TemporaryArray<BitType> decision_storage(size, 0);
|
||||
dh::TemporaryArray<BitType> missing_storage(size, 0);
|
||||
BitVector decision_bits{dh::ToSpan(decision_storage)};
|
||||
BitVector missing_bits{dh::ToSpan(missing_storage)};
|
||||
|
||||
dh::TemporaryArray<NodeSplitData> split_data_storage(num_candidates);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(),
|
||||
num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(split_data_storage.data().get(), split_data.data(),
|
||||
num_candidates * sizeof(NodeSplitData), hipMemcpyDefault));
|
||||
#endif
|
||||
auto d_split_data = dh::ToSpan(split_data_storage);
|
||||
|
||||
dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable {
|
||||
for (auto i = 0; i < num_candidates; i++) {
|
||||
auto const& data = d_split_data[i];
|
||||
auto const cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
|
||||
if (isnan(cut_value)) {
|
||||
missing_bits.Set(ridx * num_candidates + i);
|
||||
} else {
|
||||
bool go_left;
|
||||
if (data.split_type == FeatureType::kCategorical) {
|
||||
go_left = common::Decision(data.node_cats.Bits(), cut_value);
|
||||
} else {
|
||||
go_left = cut_value <= data.split_node.SplitCond();
|
||||
}
|
||||
if (go_left) {
|
||||
decision_bits.Set(ridx * num_candidates + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
collective::AllReduce<collective::Operation::kBitwiseOR>(
|
||||
ctx_->gpu_id, decision_storage.data().get(), decision_storage.size());
|
||||
collective::AllReduce<collective::Operation::kBitwiseAND>(
|
||||
ctx_->gpu_id, missing_storage.data().get(), missing_storage.size());
|
||||
collective::Synchronize(ctx_->gpu_id);
|
||||
|
||||
row_partitioner->UpdatePositionBatch(
|
||||
nidx, left_nidx, right_nidx, split_data,
|
||||
[=] __device__(bst_uint ridx, int split_index, NodeSplitData const& data) {
|
||||
auto const index = ridx * num_candidates + split_index;
|
||||
bool go_left;
|
||||
if (missing_bits.Check(index)) {
|
||||
go_left = data.split_node.DefaultLeft();
|
||||
} else {
|
||||
go_left = decision_bits.Check(index);
|
||||
}
|
||||
return go_left;
|
||||
});
|
||||
}
|
||||
|
||||
void UpdatePosition(std::vector<GPUExpandEntry> const& candidates, RegTree* p_tree) {
|
||||
if (candidates.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<bst_node_t> nidx(candidates.size());
|
||||
std::vector<bst_node_t> left_nidx(candidates.size());
|
||||
std::vector<bst_node_t> right_nidx(candidates.size());
|
||||
std::vector<NodeSplitData> split_data(candidates.size());
|
||||
|
||||
for (size_t i = 0; i < candidates.size(); i++) {
|
||||
auto& e = candidates[i];
|
||||
auto const& e = candidates[i];
|
||||
RegTree::Node split_node = (*p_tree)[e.nid];
|
||||
auto split_type = p_tree->NodeSplitType(e.nid);
|
||||
nidx.at(i) = e.nid;
|
||||
left_nidx.at(i) = split_node.LeftChild();
|
||||
right_nidx.at(i) = split_node.RightChild();
|
||||
split_data.at(i) = NodeSplitData{split_node, split_type, e.split.split_cats};
|
||||
split_data.at(i) = NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)};
|
||||
|
||||
CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
|
||||
}
|
||||
|
||||
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
||||
|
||||
if (info_.IsColumnSplit()) {
|
||||
UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
|
||||
return;
|
||||
}
|
||||
|
||||
row_partitioner->UpdatePositionBatch(
|
||||
nidx, left_nidx, right_nidx, split_data,
|
||||
[=] __device__(bst_uint ridx, const NodeSplitData& data) {
|
||||
[=] __device__(bst_uint ridx, int split_index, const NodeSplitData& data) {
|
||||
// given a row index, returns the node id it belongs to
|
||||
bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
|
||||
float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
|
||||
// Missing value
|
||||
bool go_left = true;
|
||||
if (isnan(cut_value)) {
|
||||
@@ -569,14 +629,14 @@ struct GPUHistMakerDevice {
|
||||
}
|
||||
|
||||
CHECK(p_tree);
|
||||
CHECK(out_preds_d.Device().IsCUDA());
|
||||
CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal());
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
|
||||
dh::safe_cuda(hipSetDevice(ctx_->Ordinal()));
|
||||
#endif
|
||||
CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
|
||||
|
||||
auto d_position = dh::ToSpan(positions);
|
||||
CHECK_EQ(out_preds_d.Size(), d_position.size());
|
||||
|
||||
@@ -609,9 +669,8 @@ struct GPUHistMakerDevice {
|
||||
monitor.Start("AllReduce");
|
||||
auto d_node_hist = hist.GetNodeHistogram(nidx).data();
|
||||
using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
|
||||
collective::AllReduce<collective::Operation::kSum>(
|
||||
ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
|
||||
page->Cuts().TotalBins() * 2 * num_histograms);
|
||||
collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
|
||||
page->Cuts().TotalBins() * 2 * num_histograms);
|
||||
|
||||
monitor.Stop("AllReduce");
|
||||
}
|
||||
@@ -692,7 +751,6 @@ struct GPUHistMakerDevice {
|
||||
CHECK(common::CheckNAN(candidate.split.fvalue));
|
||||
std::vector<common::CatBitField::value_type> split_cats;
|
||||
|
||||
CHECK_GT(candidate.split.split_cats.Bits().size(), 0);
|
||||
auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
|
||||
auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex);
|
||||
split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0);
|
||||
@@ -713,7 +771,6 @@ struct GPUHistMakerDevice {
|
||||
evaluator_.ApplyTreeSplit(candidate, p_tree);
|
||||
|
||||
const auto& parent = tree[candidate.nid];
|
||||
std::size_t max_nidx = std::max(parent.LeftChild(), parent.RightChild());
|
||||
interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(),
|
||||
parent.RightChild());
|
||||
}
|
||||
@@ -730,8 +787,7 @@ struct GPUHistMakerDevice {
|
||||
dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(),
|
||||
GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
|
||||
using ReduceT = typename decltype(root_sum_quantised)::ValueT;
|
||||
collective::Allreduce<collective::Operation::kSum>(
|
||||
reinterpret_cast<ReduceT *>(&root_sum_quantised), 2);
|
||||
collective::GlobalSum(info_, reinterpret_cast<ReduceT*>(&root_sum_quantised), 2);
|
||||
|
||||
hist.AllocateHistograms({kRootNIdx});
|
||||
this->BuildHist(kRootNIdx);
|
||||
@@ -749,9 +805,8 @@ struct GPUHistMakerDevice {
|
||||
return root_entry;
|
||||
}
|
||||
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat,
|
||||
ObjInfo const* task, RegTree* p_tree,
|
||||
HostDeviceVector<bst_node_t>* p_out_position) {
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat, ObjInfo const* task,
|
||||
RegTree* p_tree, HostDeviceVector<bst_node_t>* p_out_position) {
|
||||
auto& tree = *p_tree;
|
||||
// Process maximum 32 nodes at a time
|
||||
Driver<GPUExpandEntry> driver(param, 32);
|
||||
@@ -776,7 +831,6 @@ struct GPUHistMakerDevice {
|
||||
std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set),
|
||||
[&](const auto& e) { return driver.IsChildValid(e); });
|
||||
|
||||
|
||||
auto new_candidates =
|
||||
pinned.GetSpan<GPUExpandEntry>(filtered_expand_set.size() * 2, GPUExpandEntry());
|
||||
|
||||
@@ -809,8 +863,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
using GradientSumT = GradientPairPrecise;
|
||||
|
||||
public:
|
||||
explicit GPUHistMaker(Context const* ctx, ObjInfo const* task)
|
||||
: TreeUpdater(ctx), task_{task} {};
|
||||
explicit GPUHistMaker(Context const* ctx, ObjInfo const* task) : TreeUpdater(ctx), task_{task} {};
|
||||
void Configure(const Args& args) override {
|
||||
// Used in test to count how many configurations are performed
|
||||
LOG(DEBUG) << "[GPU Hist]: Configure";
|
||||
@@ -823,32 +876,31 @@ class GPUHistMaker : public TreeUpdater {
|
||||
|
||||
void LoadConfig(Json const& in) override {
|
||||
auto const& config = get<Object const>(in);
|
||||
FromJson(config.at("gpu_hist_train_param"), &this->hist_maker_param_);
|
||||
FromJson(config.at("hist_train_param"), &this->hist_maker_param_);
|
||||
initialised_ = false;
|
||||
}
|
||||
void SaveConfig(Json* p_out) const override {
|
||||
auto& out = *p_out;
|
||||
out["gpu_hist_train_param"] = ToJson(hist_maker_param_);
|
||||
out["hist_train_param"] = ToJson(hist_maker_param_);
|
||||
}
|
||||
|
||||
~GPUHistMaker() { // NOLINT
|
||||
dh::GlobalMemoryLogger().Log();
|
||||
}
|
||||
|
||||
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
|
||||
void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
|
||||
common::Span<HostDeviceVector<bst_node_t>> out_position,
|
||||
const std::vector<RegTree*>& trees) override {
|
||||
monitor_.Start("Update");
|
||||
|
||||
CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented();
|
||||
auto gpair_hdv = gpair->Data();
|
||||
// build tree
|
||||
try {
|
||||
size_t t_idx{0};
|
||||
std::size_t t_idx{0};
|
||||
for (xgboost::RegTree* tree : trees) {
|
||||
this->UpdateTree(param, gpair, dmat, tree, &out_position[t_idx]);
|
||||
|
||||
if (hist_maker_param_.debug_synchronize) {
|
||||
this->CheckTreesSynchronized(tree);
|
||||
}
|
||||
this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]);
|
||||
this->hist_maker_param_.CheckTreesSynchronized(tree);
|
||||
++t_idx;
|
||||
}
|
||||
|
||||
@@ -870,9 +922,9 @@ class GPUHistMaker : public TreeUpdater {
|
||||
// Synchronise the column sampling seed
|
||||
uint32_t column_sampling_seed = common::GlobalRandom()();
|
||||
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
|
||||
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
|
||||
|
||||
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
|
||||
auto page = (*dmat->GetBatches<EllpackPage>(ctx_, batch_param).begin()).Impl();
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
@@ -880,9 +932,9 @@ class GPUHistMaker : public TreeUpdater {
|
||||
#endif
|
||||
|
||||
info_->feature_types.SetDevice(ctx_->gpu_id);
|
||||
maker.reset(new GPUHistMakerDevice<GradientSumT>(
|
||||
ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
|
||||
column_sampling_seed, info_->num_col_, batch_param));
|
||||
maker = std::make_unique<GPUHistMakerDevice>(
|
||||
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
|
||||
*param, column_sampler_, info_->num_col_, batch_param, dmat->Info());
|
||||
|
||||
p_last_fmat_ = dmat;
|
||||
initialised_ = true;
|
||||
@@ -895,21 +947,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
monitor_.Stop("InitDataOnce");
|
||||
}
|
||||
p_last_tree_ = p_tree;
|
||||
}
|
||||
|
||||
// Only call this method for testing
|
||||
void CheckTreesSynchronized(RegTree* local_tree) const {
|
||||
std::string s_model;
|
||||
common::MemoryBufferStream fs(&s_model);
|
||||
int rank = collective::GetRank();
|
||||
if (rank == 0) {
|
||||
local_tree->Save(&fs);
|
||||
}
|
||||
fs.Seek(0);
|
||||
collective::Broadcast(&s_model, 0);
|
||||
RegTree reference_tree{}; // rank 0 tree
|
||||
reference_tree.Load(&fs);
|
||||
CHECK(*local_tree == reference_tree);
|
||||
CHECK(hist_maker_param_.GetInitialised());
|
||||
}
|
||||
|
||||
void UpdateTree(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
|
||||
@@ -935,7 +973,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
|
||||
MetaInfo* info_{}; // NOLINT
|
||||
|
||||
std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker; // NOLINT
|
||||
std::unique_ptr<GPUHistMakerDevice> maker; // NOLINT
|
||||
|
||||
[[nodiscard]] char const* Name() const override { return "grow_gpu_hist"; }
|
||||
[[nodiscard]] bool HasNodePosition() const override { return true; }
|
||||
@@ -943,13 +981,14 @@ class GPUHistMaker : public TreeUpdater {
|
||||
private:
|
||||
bool initialised_{false};
|
||||
|
||||
GPUHistMakerTrainParam hist_maker_param_;
|
||||
HistMakerTrainParam hist_maker_param_;
|
||||
|
||||
DMatrix* p_last_fmat_{nullptr};
|
||||
RegTree const* p_last_tree_{nullptr};
|
||||
ObjInfo const* task_{nullptr};
|
||||
|
||||
common::Monitor monitor_;
|
||||
std::shared_ptr<common::ColumnSampler> column_sampler_;
|
||||
};
|
||||
|
||||
#if !defined(GTEST_TEST)
|
||||
@@ -959,4 +998,135 @@ XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
|
||||
return new GPUHistMaker(ctx, task);
|
||||
});
|
||||
#endif // !defined(GTEST_TEST)
|
||||
|
||||
class GPUGlobalApproxMaker : public TreeUpdater {
|
||||
public:
|
||||
explicit GPUGlobalApproxMaker(Context const* ctx, ObjInfo const* task)
|
||||
: TreeUpdater(ctx), task_{task} {};
|
||||
void Configure(Args const& args) override {
|
||||
// Used in test to count how many configurations are performed
|
||||
LOG(DEBUG) << "[GPU Approx]: Configure";
|
||||
hist_maker_param_.UpdateAllowUnknown(args);
|
||||
if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) {
|
||||
LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU.";
|
||||
}
|
||||
dh::CheckComputeCapability();
|
||||
initialised_ = false;
|
||||
|
||||
monitor_.Init(this->Name());
|
||||
}
|
||||
|
||||
void LoadConfig(Json const& in) override {
|
||||
auto const& config = get<Object const>(in);
|
||||
FromJson(config.at("hist_train_param"), &this->hist_maker_param_);
|
||||
initialised_ = false;
|
||||
}
|
||||
void SaveConfig(Json* p_out) const override {
|
||||
auto& out = *p_out;
|
||||
out["hist_train_param"] = ToJson(hist_maker_param_);
|
||||
}
|
||||
~GPUGlobalApproxMaker() override { dh::GlobalMemoryLogger().Log(); }
|
||||
|
||||
void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat,
|
||||
common::Span<HostDeviceVector<bst_node_t>> out_position,
|
||||
const std::vector<RegTree*>& trees) override {
|
||||
monitor_.Start("Update");
|
||||
|
||||
this->InitDataOnce(p_fmat);
|
||||
// build tree
|
||||
hess_.resize(gpair->Size());
|
||||
auto hess = dh::ToSpan(hess_);
|
||||
|
||||
gpair->SetDevice(ctx_->Device());
|
||||
auto d_gpair = gpair->Data()->ConstDeviceSpan();
|
||||
auto cuctx = ctx_->CUDACtx();
|
||||
thrust::transform(cuctx->CTP(), dh::tcbegin(d_gpair), dh::tcend(d_gpair), dh::tbegin(hess),
|
||||
[=] XGBOOST_DEVICE(GradientPair const& g) { return g.GetHess(); });
|
||||
|
||||
auto const& info = p_fmat->Info();
|
||||
info.feature_types.SetDevice(ctx_->Device());
|
||||
auto batch = BatchParam{param->max_bin, hess, !task_->const_hess};
|
||||
maker_ = std::make_unique<GPUHistMakerDevice>(
|
||||
ctx_, !p_fmat->SingleColBlock(), info.feature_types.ConstDeviceSpan(), info.num_row_,
|
||||
*param, column_sampler_, info.num_col_, batch, p_fmat->Info());
|
||||
|
||||
std::size_t t_idx{0};
|
||||
for (xgboost::RegTree* tree : trees) {
|
||||
this->UpdateTree(gpair->Data(), p_fmat, tree, &out_position[t_idx]);
|
||||
this->hist_maker_param_.CheckTreesSynchronized(tree);
|
||||
++t_idx;
|
||||
}
|
||||
|
||||
monitor_.Stop("Update");
|
||||
}
|
||||
|
||||
void InitDataOnce(DMatrix* p_fmat) {
|
||||
if (this->initialised_) {
|
||||
return;
|
||||
}
|
||||
|
||||
monitor_.Start(__func__);
|
||||
CHECK(ctx_->IsCUDA()) << error::InvalidCUDAOrdinal();
|
||||
// Synchronise the column sampling seed
|
||||
uint32_t column_sampling_seed = common::GlobalRandom()();
|
||||
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
|
||||
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
|
||||
|
||||
p_last_fmat_ = p_fmat;
|
||||
initialised_ = true;
|
||||
monitor_.Stop(__func__);
|
||||
}
|
||||
|
||||
void InitData(DMatrix* p_fmat, RegTree const* p_tree) {
|
||||
this->InitDataOnce(p_fmat);
|
||||
p_last_tree_ = p_tree;
|
||||
CHECK(hist_maker_param_.GetInitialised());
|
||||
}
|
||||
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
|
||||
HostDeviceVector<bst_node_t>* p_out_position) {
|
||||
monitor_.Start("InitData");
|
||||
this->InitData(p_fmat, p_tree);
|
||||
monitor_.Stop("InitData");
|
||||
|
||||
gpair->SetDevice(ctx_->gpu_id);
|
||||
maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
|
||||
}
|
||||
|
||||
bool UpdatePredictionCache(const DMatrix* data,
|
||||
linalg::MatrixView<bst_float> p_out_preds) override {
|
||||
if (maker_ == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
|
||||
return false;
|
||||
}
|
||||
monitor_.Start("UpdatePredictionCache");
|
||||
bool result = maker_->UpdatePredictionCache(p_out_preds, p_last_tree_);
|
||||
monitor_.Stop("UpdatePredictionCache");
|
||||
return result;
|
||||
}
|
||||
|
||||
[[nodiscard]] char const* Name() const override { return "grow_gpu_approx"; }
|
||||
[[nodiscard]] bool HasNodePosition() const override { return true; }
|
||||
|
||||
private:
|
||||
bool initialised_{false};
|
||||
|
||||
HistMakerTrainParam hist_maker_param_;
|
||||
dh::device_vector<float> hess_;
|
||||
std::shared_ptr<common::ColumnSampler> column_sampler_;
|
||||
std::unique_ptr<GPUHistMakerDevice> maker_;
|
||||
|
||||
DMatrix* p_last_fmat_{nullptr};
|
||||
RegTree const* p_last_tree_{nullptr};
|
||||
ObjInfo const* task_{nullptr};
|
||||
|
||||
common::Monitor monitor_;
|
||||
};
|
||||
|
||||
#if !defined(GTEST_TEST)
|
||||
XGBOOST_REGISTER_TREE_UPDATER(GPUApproxMaker, "grow_gpu_approx")
|
||||
.describe("Grow tree with GPU.")
|
||||
.set_body([](Context const* ctx, ObjInfo const* task) {
|
||||
return new GPUGlobalApproxMaker(ctx, task);
|
||||
});
|
||||
#endif // !defined(GTEST_TEST)
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -31,7 +31,7 @@ class TreePruner : public TreeUpdater {
|
||||
[[nodiscard]] bool CanModifyTree() const override { return true; }
|
||||
|
||||
// update the tree, do pruning
|
||||
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
|
||||
void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat,
|
||||
common::Span<HostDeviceVector<bst_node_t>> out_position,
|
||||
const std::vector<RegTree*>& trees) override {
|
||||
pruner_monitor_.Start("PrunerUpdate");
|
||||
|
||||
@@ -4,39 +4,40 @@
|
||||
* \brief use quantized feature values to construct a tree
|
||||
* \author Philip Cho, Tianqi Checn, Egor Smirnov
|
||||
*/
|
||||
#include <algorithm> // for max, copy, transform
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for uint32_t, int32_t
|
||||
#include <memory> // for unique_ptr, allocator, make_unique, shared_ptr
|
||||
#include <numeric> // for accumulate
|
||||
#include <ostream> // for basic_ostream, char_traits, operator<<
|
||||
#include <utility> // for move, swap
|
||||
#include <vector> // for vector
|
||||
#include <algorithm> // for max, copy, transform
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for uint32_t, int32_t
|
||||
#include <exception> // for exception
|
||||
#include <memory> // for allocator, unique_ptr, make_unique, shared_ptr
|
||||
#include <ostream> // for operator<<, basic_ostream, char_traits
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../collective/aggregator.h" // for GlobalSum
|
||||
#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
|
||||
#include "../collective/communicator.h" // for Operation
|
||||
#include "../common/hist_util.h" // for HistogramCuts, HistCollection
|
||||
#include "../collective/communicator-inl.h" // for IsDistributed
|
||||
#include "../common/hist_util.h" // for HistogramCuts, GHistRow
|
||||
#include "../common/linalg_op.h" // for begin, cbegin, cend
|
||||
#include "../common/random.h" // for ColumnSampler
|
||||
#include "../common/threading_utils.h" // for ParallelFor
|
||||
#include "../common/timer.h" // for Monitor
|
||||
#include "../common/transform_iterator.h" // for IndexTransformIter, MakeIndexTransformIter
|
||||
#include "../common/transform_iterator.h" // for IndexTransformIter
|
||||
#include "../data/gradient_index.h" // for GHistIndexMatrix
|
||||
#include "common_row_partitioner.h" // for CommonRowPartitioner
|
||||
#include "dmlc/omp.h" // for omp_get_thread_num
|
||||
#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG
|
||||
#include "driver.h" // for Driver
|
||||
#include "hist/evaluate_splits.h" // for HistEvaluator, HistMultiEvaluator, UpdatePre...
|
||||
#include "hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
|
||||
#include "hist/histogram.h" // for HistogramBuilder, ConstructHistSpace
|
||||
#include "hist/hist_cache.h" // for BoundedHistCollection
|
||||
#include "hist/histogram.h" // for MultiHistogramBuilder
|
||||
#include "hist/param.h" // for HistMakerTrainParam
|
||||
#include "hist/sampler.h" // for SampleGradient
|
||||
#include "param.h" // for TrainParam, SplitEntryContainer, GradStats
|
||||
#include "xgboost/base.h" // for GradientPairInternal, GradientPair, bst_targ...
|
||||
#include "param.h" // for TrainParam, GradStats
|
||||
#include "xgboost/base.h" // for Args, GradientPairPrecise, GradientPair, Gra...
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/data.h" // for BatchIterator, BatchSet, DMatrix, MetaInfo
|
||||
#include "xgboost/data.h" // for BatchSet, DMatrix, BatchIterator, MetaInfo
|
||||
#include "xgboost/host_device_vector.h" // for HostDeviceVector
|
||||
#include "xgboost/linalg.h" // for All, MatrixView, TensorView, Matrix, Empty
|
||||
#include "xgboost/json.h" // for Object, Json, FromJson, ToJson, get
|
||||
#include "xgboost/linalg.h" // for MatrixView, TensorView, All, Matrix, Empty
|
||||
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_GE
|
||||
#include "xgboost/span.h" // for Span, operator!=, SpanIterator
|
||||
#include "xgboost/string_view.h" // for operator<<
|
||||
@@ -117,10 +118,11 @@ class MultiTargetHistBuilder {
|
||||
private:
|
||||
common::Monitor *monitor_{nullptr};
|
||||
TrainParam const *param_{nullptr};
|
||||
HistMakerTrainParam const *hist_param_{nullptr};
|
||||
std::shared_ptr<common::ColumnSampler> col_sampler_;
|
||||
std::unique_ptr<HistMultiEvaluator> evaluator_;
|
||||
// Histogram builder for each target.
|
||||
std::vector<HistogramBuilder<MultiExpandEntry>> histogram_builder_;
|
||||
std::unique_ptr<MultiHistogramBuilder> histogram_builder_;
|
||||
Context const *ctx_{nullptr};
|
||||
// Partitioner for each data batch.
|
||||
std::vector<CommonRowPartitioner> partitioner_;
|
||||
@@ -150,7 +152,6 @@ class MultiTargetHistBuilder {
|
||||
monitor_->Start(__func__);
|
||||
|
||||
p_last_fmat_ = p_fmat;
|
||||
std::size_t page_id = 0;
|
||||
bst_bin_t n_total_bins = 0;
|
||||
partitioner_.clear();
|
||||
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
|
||||
@@ -160,16 +161,13 @@ class MultiTargetHistBuilder {
|
||||
CHECK_EQ(n_total_bins, page.cut.TotalBins());
|
||||
}
|
||||
partitioner_.emplace_back(ctx_, page.Size(), page.base_rowid, p_fmat->Info().IsColumnSplit());
|
||||
page_id++;
|
||||
}
|
||||
|
||||
bst_target_t n_targets = p_tree->NumTargets();
|
||||
histogram_builder_.clear();
|
||||
for (std::size_t i = 0; i < n_targets; ++i) {
|
||||
histogram_builder_.emplace_back();
|
||||
histogram_builder_.back().Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
|
||||
collective::IsDistributed(), p_fmat->Info().IsColumnSplit());
|
||||
}
|
||||
histogram_builder_ = std::make_unique<MultiHistogramBuilder>();
|
||||
histogram_builder_->Reset(ctx_, n_total_bins, n_targets, HistBatch(param_),
|
||||
collective::IsDistributed(), p_fmat->Info().IsColumnSplit(),
|
||||
hist_param_);
|
||||
|
||||
evaluator_ = std::make_unique<HistMultiEvaluator>(ctx_, p_fmat->Info(), param_, col_sampler_);
|
||||
p_last_tree_ = p_tree;
|
||||
@@ -204,17 +202,7 @@ class MultiTargetHistBuilder {
|
||||
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
|
||||
root_sum.Size() * 2);
|
||||
|
||||
std::vector<MultiExpandEntry> nodes{best};
|
||||
std::size_t i = 0;
|
||||
auto space = ConstructHistSpace(partitioner_, nodes);
|
||||
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
|
||||
for (bst_target_t t{0}; t < n_targets; ++t) {
|
||||
auto t_gpair = gpair.Slice(linalg::All(), t);
|
||||
histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
|
||||
nodes, {}, t_gpair.Values());
|
||||
}
|
||||
i++;
|
||||
}
|
||||
histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, best, HistBatch(param_));
|
||||
|
||||
auto weight = evaluator_->InitRoot(root_sum);
|
||||
auto weight_t = weight.HostView();
|
||||
@@ -222,9 +210,10 @@ class MultiTargetHistBuilder {
|
||||
[&](float w) { return w * param_->learning_rate; });
|
||||
|
||||
p_tree->SetLeaf(RegTree::kRoot, weight_t);
|
||||
std::vector<common::HistCollection const *> hists;
|
||||
std::vector<BoundedHistCollection const *> hists;
|
||||
std::vector<MultiExpandEntry> nodes{{RegTree::kRoot, 0}};
|
||||
for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
|
||||
hists.push_back(&histogram_builder_[t].Histogram());
|
||||
hists.push_back(&(*histogram_builder_).Histogram(t));
|
||||
}
|
||||
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
|
||||
evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes);
|
||||
@@ -239,50 +228,17 @@ class MultiTargetHistBuilder {
|
||||
std::vector<MultiExpandEntry> const &valid_candidates,
|
||||
linalg::MatrixView<GradientPair const> gpair) {
|
||||
monitor_->Start(__func__);
|
||||
std::vector<MultiExpandEntry> nodes_to_build;
|
||||
std::vector<MultiExpandEntry> nodes_to_sub;
|
||||
|
||||
for (auto const &c : valid_candidates) {
|
||||
auto left_nidx = p_tree->LeftChild(c.nid);
|
||||
auto right_nidx = p_tree->RightChild(c.nid);
|
||||
|
||||
auto build_nidx = left_nidx;
|
||||
auto subtract_nidx = right_nidx;
|
||||
auto lit =
|
||||
common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
|
||||
auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
|
||||
auto rit =
|
||||
common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
|
||||
auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
|
||||
auto fewer_right = right_sum < left_sum;
|
||||
if (fewer_right) {
|
||||
std::swap(build_nidx, subtract_nidx);
|
||||
}
|
||||
nodes_to_build.emplace_back(build_nidx, p_tree->GetDepth(build_nidx));
|
||||
nodes_to_sub.emplace_back(subtract_nidx, p_tree->GetDepth(subtract_nidx));
|
||||
}
|
||||
|
||||
std::size_t i = 0;
|
||||
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
|
||||
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
|
||||
for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) {
|
||||
auto t_gpair = gpair.Slice(linalg::All(), t);
|
||||
// Make sure the gradient matrix is f-order.
|
||||
CHECK(t_gpair.Contiguous());
|
||||
histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
|
||||
nodes_to_build, nodes_to_sub, t_gpair.Values());
|
||||
}
|
||||
i++;
|
||||
}
|
||||
histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, gpair,
|
||||
HistBatch(param_));
|
||||
monitor_->Stop(__func__);
|
||||
}
|
||||
|
||||
void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
|
||||
std::vector<MultiExpandEntry> *best_splits) {
|
||||
monitor_->Start(__func__);
|
||||
std::vector<common::HistCollection const *> hists;
|
||||
std::vector<BoundedHistCollection const *> hists;
|
||||
for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
|
||||
hists.push_back(&histogram_builder_[t].Histogram());
|
||||
hists.push_back(&(*histogram_builder_).Histogram(t));
|
||||
}
|
||||
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
|
||||
evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits);
|
||||
@@ -306,10 +262,12 @@ class MultiTargetHistBuilder {
|
||||
|
||||
public:
|
||||
explicit MultiTargetHistBuilder(Context const *ctx, MetaInfo const &info, TrainParam const *param,
|
||||
HistMakerTrainParam const *hist_param,
|
||||
std::shared_ptr<common::ColumnSampler> column_sampler,
|
||||
ObjInfo const *task, common::Monitor *monitor)
|
||||
: monitor_{monitor},
|
||||
param_{param},
|
||||
hist_param_{hist_param},
|
||||
col_sampler_{std::move(column_sampler)},
|
||||
evaluator_{std::make_unique<HistMultiEvaluator>(ctx, info, param, col_sampler_)},
|
||||
ctx_{ctx},
|
||||
@@ -331,10 +289,14 @@ class MultiTargetHistBuilder {
|
||||
}
|
||||
};
|
||||
|
||||
class HistBuilder {
|
||||
/**
|
||||
* @brief Tree updater for single-target trees.
|
||||
*/
|
||||
class HistUpdater {
|
||||
private:
|
||||
common::Monitor *monitor_;
|
||||
TrainParam const *param_;
|
||||
HistMakerTrainParam const *hist_param_{nullptr};
|
||||
std::shared_ptr<common::ColumnSampler> col_sampler_;
|
||||
std::unique_ptr<HistEvaluator> evaluator_;
|
||||
std::vector<CommonRowPartitioner> partitioner_;
|
||||
@@ -343,22 +305,22 @@ class HistBuilder {
|
||||
const RegTree *p_last_tree_{nullptr};
|
||||
DMatrix const *const p_last_fmat_{nullptr};
|
||||
|
||||
std::unique_ptr<HistogramBuilder<CPUExpandEntry>> histogram_builder_;
|
||||
std::unique_ptr<MultiHistogramBuilder> histogram_builder_;
|
||||
ObjInfo const *task_{nullptr};
|
||||
// Context for number of threads
|
||||
Context const *ctx_{nullptr};
|
||||
|
||||
public:
|
||||
explicit HistBuilder(Context const *ctx, std::shared_ptr<common::ColumnSampler> column_sampler,
|
||||
TrainParam const *param, DMatrix const *fmat, ObjInfo const *task,
|
||||
common::Monitor *monitor)
|
||||
explicit HistUpdater(Context const *ctx, std::shared_ptr<common::ColumnSampler> column_sampler,
|
||||
TrainParam const *param, HistMakerTrainParam const *hist_param,
|
||||
DMatrix const *fmat, ObjInfo const *task, common::Monitor *monitor)
|
||||
: monitor_{monitor},
|
||||
param_{param},
|
||||
hist_param_{hist_param},
|
||||
col_sampler_{std::move(column_sampler)},
|
||||
evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(),
|
||||
col_sampler_)},
|
||||
evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(), col_sampler_)},
|
||||
p_last_fmat_(fmat),
|
||||
histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
|
||||
histogram_builder_{new MultiHistogramBuilder},
|
||||
task_{task},
|
||||
ctx_{ctx} {
|
||||
monitor_->Init(__func__);
|
||||
@@ -381,7 +343,6 @@ class HistBuilder {
|
||||
// initialize temp data structure
|
||||
void InitData(DMatrix *fmat, RegTree const *p_tree) {
|
||||
monitor_->Start(__func__);
|
||||
std::size_t page_id{0};
|
||||
bst_bin_t n_total_bins{0};
|
||||
partitioner_.clear();
|
||||
for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
|
||||
@@ -392,10 +353,9 @@ class HistBuilder {
|
||||
}
|
||||
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid,
|
||||
fmat->Info().IsColumnSplit());
|
||||
++page_id;
|
||||
}
|
||||
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
|
||||
collective::IsDistributed(), fmat->Info().IsColumnSplit());
|
||||
histogram_builder_->Reset(ctx_, n_total_bins, 1, HistBatch(param_), collective::IsDistributed(),
|
||||
fmat->Info().IsColumnSplit(), hist_param_);
|
||||
evaluator_ = std::make_unique<HistEvaluator>(ctx_, this->param_, fmat->Info(), col_sampler_);
|
||||
p_last_tree_ = p_tree;
|
||||
monitor_->Stop(__func__);
|
||||
@@ -404,7 +364,7 @@ class HistBuilder {
|
||||
void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
|
||||
std::vector<CPUExpandEntry> *best_splits) {
|
||||
monitor_->Start(__func__);
|
||||
auto const &histograms = histogram_builder_->Histogram();
|
||||
auto const &histograms = histogram_builder_->Histogram(0);
|
||||
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
|
||||
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
|
||||
evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, best_splits);
|
||||
@@ -422,16 +382,8 @@ class HistBuilder {
|
||||
monitor_->Start(__func__);
|
||||
CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0));
|
||||
|
||||
std::size_t page_id = 0;
|
||||
auto space = ConstructHistSpace(partitioner_, {node});
|
||||
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
|
||||
std::vector<CPUExpandEntry> nodes_to_build{node};
|
||||
std::vector<CPUExpandEntry> nodes_to_sub;
|
||||
this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
|
||||
partitioner_.at(page_id).Partitions(), nodes_to_build,
|
||||
nodes_to_sub, gpair.Slice(linalg::All(), 0).Values());
|
||||
++page_id;
|
||||
}
|
||||
this->histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, node,
|
||||
HistBatch(param_));
|
||||
|
||||
{
|
||||
GradientPairPrecise grad_stat;
|
||||
@@ -445,7 +397,7 @@ class HistBuilder {
|
||||
CHECK_GE(row_ptr.size(), 2);
|
||||
std::uint32_t const ibegin = row_ptr[0];
|
||||
std::uint32_t const iend = row_ptr[1];
|
||||
auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot];
|
||||
auto hist = this->histogram_builder_->Histogram(0)[RegTree::kRoot];
|
||||
auto begin = hist.data();
|
||||
for (std::uint32_t i = ibegin; i < iend; ++i) {
|
||||
GradientPairPrecise const &et = begin[i];
|
||||
@@ -468,7 +420,7 @@ class HistBuilder {
|
||||
monitor_->Start("EvaluateSplits");
|
||||
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
|
||||
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
|
||||
evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree,
|
||||
evaluator_->EvaluateSplits(histogram_builder_->Histogram(0), gmat.cut, ft, *p_tree,
|
||||
&entries);
|
||||
break;
|
||||
}
|
||||
@@ -484,33 +436,8 @@ class HistBuilder {
|
||||
std::vector<CPUExpandEntry> const &valid_candidates,
|
||||
linalg::MatrixView<GradientPair const> gpair) {
|
||||
monitor_->Start(__func__);
|
||||
std::vector<CPUExpandEntry> nodes_to_build(valid_candidates.size());
|
||||
std::vector<CPUExpandEntry> nodes_to_sub(valid_candidates.size());
|
||||
|
||||
std::size_t n_idx = 0;
|
||||
for (auto const &c : valid_candidates) {
|
||||
auto left_nidx = (*p_tree)[c.nid].LeftChild();
|
||||
auto right_nidx = (*p_tree)[c.nid].RightChild();
|
||||
auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
|
||||
|
||||
auto build_nidx = left_nidx;
|
||||
auto subtract_nidx = right_nidx;
|
||||
if (fewer_right) {
|
||||
std::swap(build_nidx, subtract_nidx);
|
||||
}
|
||||
nodes_to_build[n_idx] = CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}};
|
||||
nodes_to_sub[n_idx] = CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}};
|
||||
n_idx++;
|
||||
}
|
||||
|
||||
std::size_t page_id{0};
|
||||
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
|
||||
for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
|
||||
histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
|
||||
partitioner_.at(page_id).Partitions(), nodes_to_build,
|
||||
nodes_to_sub, gpair.Values());
|
||||
++page_id;
|
||||
}
|
||||
this->histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates,
|
||||
gpair, HistBatch(param_));
|
||||
monitor_->Stop(__func__);
|
||||
}
|
||||
|
||||
@@ -529,7 +456,7 @@ class HistBuilder {
|
||||
std::vector<bst_node_t> *p_out_position) {
|
||||
monitor_->Start(__func__);
|
||||
if (!task_->UpdateTreeLeaf()) {
|
||||
monitor_->Stop(__func__);
|
||||
monitor_->Stop(__func__);
|
||||
return;
|
||||
}
|
||||
for (auto const &part : partitioner_) {
|
||||
@@ -541,42 +468,50 @@ class HistBuilder {
|
||||
|
||||
/*! \brief construct a tree using quantized feature values */
|
||||
class QuantileHistMaker : public TreeUpdater {
|
||||
std::unique_ptr<HistBuilder> p_impl_{nullptr};
|
||||
std::unique_ptr<HistUpdater> p_impl_{nullptr};
|
||||
std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
|
||||
std::shared_ptr<common::ColumnSampler> column_sampler_ =
|
||||
std::make_shared<common::ColumnSampler>();
|
||||
common::Monitor monitor_;
|
||||
ObjInfo const *task_{nullptr};
|
||||
HistMakerTrainParam hist_param_;
|
||||
|
||||
public:
|
||||
explicit QuantileHistMaker(Context const *ctx, ObjInfo const *task)
|
||||
: TreeUpdater{ctx}, task_{task} {}
|
||||
void Configure(const Args &) override {}
|
||||
|
||||
void LoadConfig(Json const &) override {}
|
||||
void SaveConfig(Json *) const override {}
|
||||
void Configure(Args const &args) override { hist_param_.UpdateAllowUnknown(args); }
|
||||
void LoadConfig(Json const &in) override {
|
||||
auto const &config = get<Object const>(in);
|
||||
FromJson(config.at("hist_train_param"), &hist_param_);
|
||||
}
|
||||
void SaveConfig(Json *p_out) const override {
|
||||
auto &out = *p_out;
|
||||
out["hist_train_param"] = ToJson(hist_param_);
|
||||
}
|
||||
|
||||
[[nodiscard]] char const *Name() const override { return "grow_quantile_histmaker"; }
|
||||
|
||||
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
|
||||
void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *p_fmat,
|
||||
common::Span<HostDeviceVector<bst_node_t>> out_position,
|
||||
const std::vector<RegTree *> &trees) override {
|
||||
if (trees.front()->IsMultiTarget()) {
|
||||
CHECK(hist_param_.GetInitialised());
|
||||
CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented();
|
||||
if (!p_mtimpl_) {
|
||||
this->p_mtimpl_ = std::make_unique<MultiTargetHistBuilder>(
|
||||
ctx_, p_fmat->Info(), param, column_sampler_, task_, &monitor_);
|
||||
ctx_, p_fmat->Info(), param, &hist_param_, column_sampler_, task_, &monitor_);
|
||||
}
|
||||
} else {
|
||||
CHECK(hist_param_.GetInitialised());
|
||||
if (!p_impl_) {
|
||||
p_impl_ =
|
||||
std::make_unique<HistBuilder>(ctx_, column_sampler_, param, p_fmat, task_, &monitor_);
|
||||
p_impl_ = std::make_unique<HistUpdater>(ctx_, column_sampler_, param, &hist_param_, p_fmat,
|
||||
task_, &monitor_);
|
||||
}
|
||||
}
|
||||
|
||||
bst_target_t n_targets = trees.front()->NumTargets();
|
||||
auto h_gpair =
|
||||
linalg::MakeTensorView(ctx_, gpair->HostSpan(), p_fmat->Info().num_row_, n_targets);
|
||||
auto h_gpair = gpair->HostView();
|
||||
|
||||
linalg::Matrix<GradientPair> sample_out;
|
||||
auto h_sample_out = h_gpair;
|
||||
@@ -601,6 +536,8 @@ class QuantileHistMaker : public TreeUpdater {
|
||||
UpdateTree<CPUExpandEntry>(&monitor_, h_sample_out, p_impl_.get(), p_fmat, param,
|
||||
h_out_position, *tree_it);
|
||||
}
|
||||
|
||||
hist_param_.CheckTreesSynchronized(*tree_it);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -31,11 +31,14 @@ class TreeRefresher : public TreeUpdater {
|
||||
[[nodiscard]] char const *Name() const override { return "refresh"; }
|
||||
[[nodiscard]] bool CanModifyTree() const override { return true; }
|
||||
// update the tree, do pruning
|
||||
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
|
||||
void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *p_fmat,
|
||||
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
|
||||
const std::vector<RegTree *> &trees) override {
|
||||
if (trees.size() == 0) return;
|
||||
const std::vector<GradientPair> &gpair_h = gpair->ConstHostVector();
|
||||
if (trees.size() == 0) {
|
||||
return;
|
||||
}
|
||||
CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented();
|
||||
const std::vector<GradientPair> &gpair_h = gpair->Data()->ConstHostVector();
|
||||
// thread temporal space
|
||||
std::vector<std::vector<GradStats> > stemp;
|
||||
std::vector<RegTree::FVec> fvec_temp;
|
||||
|
||||
@@ -31,7 +31,7 @@ class TreeSyncher : public TreeUpdater {
|
||||
|
||||
[[nodiscard]] char const* Name() const override { return "prune"; }
|
||||
|
||||
void Update(TrainParam const*, HostDeviceVector<GradientPair>*, DMatrix*,
|
||||
void Update(TrainParam const*, linalg::Matrix<GradientPair>*, DMatrix*,
|
||||
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
|
||||
const std::vector<RegTree*>& trees) override {
|
||||
if (collective::GetWorldSize() == 1) return;
|
||||
|
||||
Reference in New Issue
Block a user