[EM] Handle base idx in GPU histogram. (#10549)
@@ -1,8 +1,7 @@
 /**
  * Copyright 2020-2024, XGBoost Contributors
  */
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/reduce.h>
+#include <thrust/iterator/transform_iterator.h>  // for make_transform_iterator

 #include <algorithm>
 #include <cstdint>  // uint32_t, int32_t
@@ -101,9 +100,8 @@ GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span<GradientPa
       static_cast<T>(1) / to_floating_point_.GetHess());
 }

-XGBOOST_DEV_INLINE void
-AtomicAddGpairShared(xgboost::GradientPairInt64 *dest,
-                     xgboost::GradientPairInt64 const &gpair) {
+XGBOOST_DEV_INLINE void AtomicAddGpairShared(xgboost::GradientPairInt64* dest,
+                                             xgboost::GradientPairInt64 const& gpair) {
   auto dst_ptr = reinterpret_cast<int64_t *>(dest);
   auto g = gpair.GetQuantisedGrad();
   auto h = gpair.GetQuantisedHess();
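
The helper above accumulates a quantised gradient pair via the int64_t words of the destination. A minimal standalone sketch of that pattern (not the XGBoost source; the struct and function names here are illustrative):

#include <cstdint>

struct QuantisedPair {
  std::int64_t grad;
  std::int64_t hess;
};

// CUDA has no atomicAdd overload for int64_t, so each 64-bit word is
// reinterpreted as unsigned long long; two's-complement arithmetic makes
// the addition equivalent for signed values.
__device__ void AtomicAddPair(QuantisedPair* dest, QuantisedPair const& v) {
  auto* ptr = reinterpret_cast<unsigned long long*>(dest);
  atomicAdd(ptr, static_cast<unsigned long long>(v.grad));
  atomicAdd(ptr + 1, static_cast<unsigned long long>(v.hess));
}
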
@@ -131,7 +129,9 @@ template <int kBlockThreads, int kItemsPerThread,
 class HistogramAgent {
   GradientPairInt64* smem_arr_;
   GradientPairInt64* d_node_hist_;
-  dh::LDGIterator<const RowPartitioner::RowIndexT> d_ridx_;
+  using Idx = RowPartitioner::RowIndexT;
+
+  dh::LDGIterator<const Idx> d_ridx_;
   const GradientPair* d_gpair_;
   const FeatureGroup group_;
   const EllpackDeviceAccessor& matrix_;
@@ -142,8 +142,7 @@ class HistogramAgent {
  public:
   __device__ HistogramAgent(GradientPairInt64* smem_arr,
                             GradientPairInt64* __restrict__ d_node_hist, const FeatureGroup& group,
-                            const EllpackDeviceAccessor& matrix,
-                            common::Span<const RowPartitioner::RowIndexT> d_ridx,
+                            const EllpackDeviceAccessor& matrix, common::Span<const Idx> d_ridx,
                             const GradientQuantiser& rounding, const GradientPair* d_gpair)
       : smem_arr_(smem_arr),
         d_node_hist_(d_node_hist),
@@ -154,15 +153,15 @@ class HistogramAgent {
         n_elements_(feature_stride_ * d_ridx.size()),
         rounding_(rounding),
         d_gpair_(d_gpair) {}

   __device__ void ProcessPartialTileShared(std::size_t offset) {
     for (std::size_t idx = offset + threadIdx.x;
          idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_);
          idx += kBlockThreads) {
-      int ridx = d_ridx_[idx / feature_stride_];
-      int gidx =
-          matrix_
-              .gidx_iter[ridx * matrix_.row_stride + group_.start_feature + idx % feature_stride_] -
-          group_.start_bin;
+      Idx ridx = d_ridx_[idx / feature_stride_];
+      Idx midx = (ridx - matrix_.base_rowid) * matrix_.row_stride + group_.start_feature +
+                 idx % feature_stride_;
+      bst_bin_t gidx = matrix_.gidx_iter[midx] - group_.start_bin;
       if (matrix_.is_dense || gidx != matrix_.NumBins()) {
         auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]);
         AtomicAddGpairShared(smem_arr_ + gidx, adjusted);
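
This hunk carries the substance of the commit: d_ridx_ now yields global row indices (the RowPartitioner change further down seeds them with base_rowid), so the kernel subtracts matrix_.base_rowid to get a batch-local offset into the ELLPACK buffer. A minimal sketch of the index arithmetic under that assumption (illustrative names, host code for clarity):

#include <cstddef>

// Maps a global row index into a batch that stores rows
// [base_rowid, base_rowid + n_rows) contiguously with a fixed row stride.
std::size_t EllpackOffset(std::size_t global_ridx, std::size_t base_rowid,
                          std::size_t row_stride, std::size_t start_feature,
                          std::size_t feature_idx) {
  std::size_t local_ridx = global_ridx - base_rowid;  // batch-local row
  return local_ridx * row_stride + start_feature + feature_idx;
}
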
@@ -188,8 +187,8 @@ class HistogramAgent {
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
       gpair[i] = d_gpair_[ridx[i]];
-      gidx[i] = matrix_.gidx_iter[ridx[i] * matrix_.row_stride + group_.start_feature +
-                                  idx[i] % feature_stride_];
+      gidx[i] = matrix_.gidx_iter[(ridx[i] - matrix_.base_rowid) * matrix_.row_stride +
+                                  group_.start_feature + idx[i] % feature_stride_];
     }
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
@@ -200,7 +199,7 @@ class HistogramAgent {
     }
   }
   __device__ void BuildHistogramWithShared() {
-    dh::BlockFill(smem_arr_, group_.num_bins, GradientPairInt64());
+    dh::BlockFill(smem_arr_, group_.num_bins, GradientPairInt64{});
     __syncthreads();

     std::size_t offset = blockIdx.x * kItemsPerTile;
@@ -219,10 +218,9 @@ class HistogramAgent {

   __device__ void BuildHistogramWithGlobal() {
     for (auto idx : dh::GridStrideRange(static_cast<std::size_t>(0), n_elements_)) {
-      int ridx = d_ridx_[idx / feature_stride_];
-      int gidx =
-          matrix_
-              .gidx_iter[ridx * matrix_.row_stride + group_.start_feature + idx % feature_stride_];
+      Idx ridx = d_ridx_[idx / feature_stride_];
+      bst_bin_t gidx = matrix_.gidx_iter[(ridx - matrix_.base_rowid) * matrix_.row_stride +
+                                         group_.start_feature + idx % feature_stride_];
       if (matrix_.is_dense || gidx != matrix_.NumBins()) {
         auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]);
         AtomicAddGpairGlobal(d_node_hist_ + gidx, adjusted);
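
BuildHistogramWithGlobal walks the index space with dh::GridStrideRange, XGBoost's range wrapper over the standard grid-stride loop. A standalone sketch of the underlying pattern (plain CUDA, not the wrapper itself):

#include <cstddef>

__global__ void ScaleKernel(float* data, std::size_t n) {
  // Each thread starts at its global id and hops by the total thread count,
  // so any grid size covers all n elements.
  std::size_t stride = static_cast<std::size_t>(blockDim.x) * gridDim.x;
  for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
    data[i] *= 2.0f;
  }
}
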
@@ -231,8 +229,7 @@ class HistogramAgent {
   }
 };

-template <bool use_shared_memory_histograms, int kBlockThreads,
-          int kItemsPerThread>
+template <bool use_shared_memory_histograms, int kBlockThreads, int kItemsPerThread>
 __global__ void __launch_bounds__(kBlockThreads)
     SharedMemHistKernel(const EllpackDeviceAccessor matrix,
                         const FeatureGroupsAccessor feature_groups,
@@ -251,6 +248,7 @@ __global__ void __launch_bounds__(kBlockThreads)
     agent.BuildHistogramWithGlobal();
   }
 }

+namespace {
 constexpr std::int32_t kBlockThreads = 1024;
 constexpr std::int32_t kItemsPerThread = 8;
@@ -78,5 +78,4 @@ class DeviceHistogramBuilder {
                          common::Span<GradientPairInt64> histogram, GradientQuantiser rounding);
 };
 }  // namespace xgboost::tree
-
 #endif  // HISTOGRAM_CUH_
@@ -1,28 +1,23 @@
-/*!
- * Copyright 2017-2022 XGBoost contributors
+/**
+ * Copyright 2017-2024, XGBoost contributors
  */
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/sequence.h>
+#include <thrust/sequence.h>  // for sequence

-#include <vector>
+#include <vector>  // for vector

-#include "../../common/device_helpers.cuh"
+#include "../../common/cuda_context.cuh"    // for CUDAContext
+#include "../../common/device_helpers.cuh"  // for CopyDeviceSpanToVector, ToSpan
 #include "row_partitioner.cuh"

-namespace xgboost {
-namespace tree {
-
-RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows)
-    : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
+namespace xgboost::tree {
+RowPartitioner::RowPartitioner(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid)
+    : device_idx_(ctx->Device()), ridx_(n_samples), ridx_tmp_(n_samples) {
   dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
-  ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
-  thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
+  ridx_segments_.emplace_back(NodePositionInfo{Segment(0, n_samples)});
+  thrust::sequence(ctx->CUDACtx()->CTP(), ridx_.data(), ridx_.data() + ridx_.size(), base_rowid);
 }

-RowPartitioner::~RowPartitioner() {
-  dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
-}
+RowPartitioner::~RowPartitioner() { dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); }

 common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {
   auto segment = ridx_segments_.at(nidx).segment;
@@ -39,6 +34,4 @@ std::vector<RowPartitioner::RowIndexT> RowPartitioner::GetRowsHost(bst_node_t ni
   dh::CopyDeviceSpanToVector(&rows, span);
   return rows;
 }
-
-};  // namespace tree
-};  // namespace xgboost
+};  // namespace xgboost::tree
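
The constructor change above is what makes the kernel-side subtraction valid: the extra argument to thrust::sequence seeds ridx_ with base_rowid, base_rowid + 1, ..., so the partitioner hands out global row indices from the start. A standalone sketch of that overload (using the stock device policy instead of XGBoost's CUDA context):

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>

#include <cstdint>

int main() {
  std::uint32_t base_rowid = 1000;  // first global row of this batch
  thrust::device_vector<std::uint32_t> ridx(4);
  // Fills ridx with {1000, 1001, 1002, 1003}.
  thrust::sequence(thrust::device, ridx.begin(), ridx.end(), base_rowid);
  return 0;
}
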
@@ -1,17 +1,17 @@
-/*!
- * Copyright 2017-2022 XGBoost contributors
+/**
+ * Copyright 2017-2024, XGBoost contributors
  */
 #pragma once
 #include <thrust/execution_policy.h>
+#include <thrust/iterator/counting_iterator.h>          // for make_counting_iterator
+#include <thrust/iterator/transform_output_iterator.h>  // for make_transform_output_iterator

-#include <limits>
-#include <vector>
+#include <algorithm>  // for max
+#include <vector>     // for vector

-#include "../../common/device_helpers.cuh"
-#include "xgboost/base.h"
-#include "xgboost/context.h"
-#include "xgboost/task.h"
-#include "xgboost/tree_model.h"
+#include "../../common/device_helpers.cuh"  // for MakeTransformIterator
+#include "xgboost/base.h"                   // for bst_idx_t
+#include "xgboost/context.h"                // for Context

 namespace xgboost {
 namespace tree {
@@ -223,7 +223,12 @@ class RowPartitioner {
   dh::PinnedMemory pinned2_;

  public:
-  RowPartitioner(DeviceOrd device_idx, size_t num_rows);
+  /**
+   * @param ctx Context for device ordinal and stream.
+   * @param n_samples The number of samples in each batch.
+   * @param base_rowid The base row index for the current batch.
+   */
+  RowPartitioner(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid);
   ~RowPartitioner();
   RowPartitioner(const RowPartitioner&) = delete;
   RowPartitioner& operator=(const RowPartitioner&) = delete;
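
A self-contained sketch of the per-batch construction pattern the new signature enables; the stub types below stand in for XGBoost's real Context and RowPartitioner, and only the parameter meanings come from the documentation above:

#include <cstdint>
#include <memory>
#include <numeric>
#include <vector>

using bst_idx_t = std::uint64_t;  // matches the 64-bit index type in xgboost/base.h
struct Context {};                // stub for xgboost::Context

class RowPartitioner {
 public:
  RowPartitioner(Context const*, bst_idx_t n_samples, bst_idx_t base_rowid)
      : ridx_(n_samples) {
    // Mirror the CUDA constructor: indices are global, starting at base_rowid.
    std::iota(ridx_.begin(), ridx_.end(), base_rowid);
  }
  std::vector<bst_idx_t> const& GetRows() const { return ridx_; }

 private:
  std::vector<bst_idx_t> ridx_;
};

int main() {
  Context ctx;
  // Second batch of 100 rows: its rows are the global indices [100, 200).
  auto part = std::make_unique<RowPartitioner>(&ctx, 100, 100);
  return part->GetRows().front() == 100 ? 0 : 1;
}
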
@@ -251,7 +251,8 @@ struct GPUHistMakerDevice {
     quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, dmat->Info());

     row_partitioner.reset();  // Release the device memory first before reallocating
-    row_partitioner = std::make_unique<RowPartitioner>(ctx_->Device(), sample.sample_rows);
+    CHECK_EQ(page->base_rowid, 0);
+    row_partitioner = std::make_unique<RowPartitioner>(ctx_, sample.sample_rows, page->base_rowid);

     // Init histogram
     hist.Init(ctx_->Device(), page->Cuts().TotalBins());