For histograms, opting into maximum shared memory available per block. (#5491)
This commit is contained in:
parent
9c1103e06c
commit
73142041b9
@ -100,7 +100,7 @@ inline size_t TotalMemory(int device_idx) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \fn inline int max_shared_memory(int device_idx)
|
* \fn inline int MaxSharedMemory(int device_idx)
|
||||||
*
|
*
|
||||||
* \brief Maximum shared memory per block on this device.
|
* \brief Maximum shared memory per block on this device.
|
||||||
*
|
*
|
||||||
@ -113,6 +113,23 @@ inline size_t MaxSharedMemory(int device_idx) {
|
|||||||
return prop.sharedMemPerBlock;
|
return prop.sharedMemPerBlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \fn inline int MaxSharedMemoryOptin(int device_idx)
|
||||||
|
*
|
||||||
|
* \brief Maximum dynamic shared memory per thread block on this device
|
||||||
|
that can be opted into when using cudaFuncSetAttribute().
|
||||||
|
*
|
||||||
|
* \param device_idx Zero-based index of the device.
|
||||||
|
*/
|
||||||
|
|
||||||
|
inline size_t MaxSharedMemoryOptin(int device_idx) {
|
||||||
|
int max_shared_memory = 0;
|
||||||
|
dh::safe_cuda(cudaDeviceGetAttribute
|
||||||
|
(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlockOptin,
|
||||||
|
device_idx));
|
||||||
|
return size_t(max_shared_memory);
|
||||||
|
}
|
||||||
|
|
||||||
inline void CheckComputeCapability() {
|
inline void CheckComputeCapability() {
|
||||||
for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) {
|
for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) {
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
|
|||||||
@ -150,21 +150,37 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
|
|||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> d_ridx,
|
common::Span<const uint32_t> d_ridx,
|
||||||
common::Span<GradientSumT> histogram,
|
common::Span<GradientSumT> histogram,
|
||||||
GradientSumT rounding, bool shared) {
|
GradientSumT rounding) {
|
||||||
const size_t smem_size =
|
// decide whether to use shared memory
|
||||||
shared
|
int device = 0;
|
||||||
? sizeof(GradientSumT) * matrix.NumBins()
|
dh::safe_cuda(cudaGetDevice(&device));
|
||||||
: 0;
|
int max_shared_memory = dh::MaxSharedMemoryOptin(device);
|
||||||
auto n_elements = d_ridx.size() * matrix.row_stride;
|
size_t smem_size = sizeof(GradientSumT) * matrix.NumBins();
|
||||||
|
bool shared = smem_size <= max_shared_memory;
|
||||||
|
smem_size = shared ? smem_size : 0;
|
||||||
|
|
||||||
uint32_t items_per_thread = 8;
|
// opt into maximum shared memory for the kernel if necessary
|
||||||
uint32_t block_threads = 256;
|
auto kernel = SharedMemHistKernel<GradientSumT>;
|
||||||
auto grid_size = static_cast<uint32_t>(
|
if (shared) {
|
||||||
common::DivRoundUp(n_elements, items_per_thread * block_threads));
|
dh::safe_cuda(cudaFuncSetAttribute
|
||||||
|
(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||||
|
max_shared_memory));
|
||||||
|
}
|
||||||
|
|
||||||
|
// determine the launch configuration
|
||||||
|
unsigned block_threads = shared ? 1024 : 256;
|
||||||
|
int n_mps = 0;
|
||||||
|
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
|
||||||
|
int n_blocks_per_mp = 0;
|
||||||
|
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
||||||
|
(&n_blocks_per_mp, kernel, block_threads, smem_size));
|
||||||
|
unsigned grid_size = n_blocks_per_mp * n_mps;
|
||||||
|
|
||||||
|
auto n_elements = d_ridx.size() * matrix.row_stride;
|
||||||
dh::LaunchKernel {grid_size, block_threads, smem_size} (
|
dh::LaunchKernel {grid_size, block_threads, smem_size} (
|
||||||
SharedMemHistKernel<GradientSumT>,
|
kernel, matrix, d_ridx, histogram.data(), gpair.data(), n_elements,
|
||||||
matrix, d_ridx, histogram.data(), gpair.data(), n_elements,
|
|
||||||
rounding, shared);
|
rounding, shared);
|
||||||
|
dh::safe_cuda(cudaGetLastError());
|
||||||
}
|
}
|
||||||
|
|
||||||
template void BuildGradientHistogram<GradientPair>(
|
template void BuildGradientHistogram<GradientPair>(
|
||||||
@ -172,13 +188,14 @@ template void BuildGradientHistogram<GradientPair>(
|
|||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> ridx,
|
common::Span<const uint32_t> ridx,
|
||||||
common::Span<GradientPair> histogram,
|
common::Span<GradientPair> histogram,
|
||||||
GradientPair rounding, bool shared);
|
GradientPair rounding);
|
||||||
|
|
||||||
template void BuildGradientHistogram<GradientPairPrecise>(
|
template void BuildGradientHistogram<GradientPairPrecise>(
|
||||||
EllpackDeviceAccessor const& matrix,
|
EllpackDeviceAccessor const& matrix,
|
||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> ridx,
|
common::Span<const uint32_t> ridx,
|
||||||
common::Span<GradientPairPrecise> histogram,
|
common::Span<GradientPairPrecise> histogram,
|
||||||
GradientPairPrecise rounding, bool shared);
|
GradientPairPrecise rounding);
|
||||||
|
|
||||||
} // namespace tree
|
} // namespace tree
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
@ -22,7 +22,7 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
|
|||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> ridx,
|
common::Span<const uint32_t> ridx,
|
||||||
common::Span<GradientSumT> histogram,
|
common::Span<GradientSumT> histogram,
|
||||||
GradientSumT rounding, bool shared);
|
GradientSumT rounding);
|
||||||
} // namespace tree
|
} // namespace tree
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|
||||||
|
|||||||
@ -422,7 +422,6 @@ struct GPUHistMakerDevice {
|
|||||||
|
|
||||||
TrainParam param;
|
TrainParam param;
|
||||||
bool deterministic_histogram;
|
bool deterministic_histogram;
|
||||||
bool use_shared_memory_histograms {false};
|
|
||||||
|
|
||||||
GradientSumT histogram_rounding;
|
GradientSumT histogram_rounding;
|
||||||
|
|
||||||
@ -596,7 +595,7 @@ struct GPUHistMakerDevice {
|
|||||||
auto d_node_hist = hist.GetNodeHistogram(nidx);
|
auto d_node_hist = hist.GetNodeHistogram(nidx);
|
||||||
auto d_ridx = row_partitioner->GetRows(nidx);
|
auto d_ridx = row_partitioner->GetRows(nidx);
|
||||||
BuildGradientHistogram(page->GetDeviceAccessor(device_id), gpair, d_ridx, d_node_hist,
|
BuildGradientHistogram(page->GetDeviceAccessor(device_id), gpair, d_ridx, d_node_hist,
|
||||||
histogram_rounding, use_shared_memory_histograms);
|
histogram_rounding);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SubtractionTrick(int nidx_parent, int nidx_histogram,
|
void SubtractionTrick(int nidx_parent, int nidx_histogram,
|
||||||
@ -910,15 +909,6 @@ inline void GPUHistMakerDevice<GradientSumT>::InitHistogram() {
|
|||||||
host_node_sum_gradients.resize(param.MaxNodes());
|
host_node_sum_gradients.resize(param.MaxNodes());
|
||||||
node_sum_gradients.resize(param.MaxNodes());
|
node_sum_gradients.resize(param.MaxNodes());
|
||||||
|
|
||||||
// check if we can use shared memory for building histograms
|
|
||||||
// (assuming atleast we need 2 CTAs per SM to maintain decent latency
|
|
||||||
// hiding)
|
|
||||||
auto histogram_size = sizeof(GradientSumT) * page->Cuts().TotalBins();
|
|
||||||
auto max_smem = dh::MaxSharedMemory(device_id);
|
|
||||||
if (histogram_size <= max_smem) {
|
|
||||||
use_shared_memory_histograms = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Init histogram
|
// Init histogram
|
||||||
hist.Init(device_id, page->Cuts().TotalBins());
|
hist.Init(device_id, page->Cuts().TotalBins());
|
||||||
}
|
}
|
||||||
|
|||||||
@ -27,7 +27,7 @@ void TestDeterminsticHistogram() {
|
|||||||
|
|
||||||
auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
|
auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
|
||||||
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
|
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
|
||||||
d_histogram, rounding, true);
|
d_histogram, rounding);
|
||||||
|
|
||||||
for (size_t i = 0; i < kRounds; ++i) {
|
for (size_t i = 0; i < kRounds; ++i) {
|
||||||
dh::device_vector<Gradient> new_histogram(kBins * kCols);
|
dh::device_vector<Gradient> new_histogram(kBins * kCols);
|
||||||
@ -35,7 +35,7 @@ void TestDeterminsticHistogram() {
|
|||||||
|
|
||||||
auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
|
auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
|
||||||
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
|
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
|
||||||
d_histogram, rounding, true);
|
d_histogram, rounding);
|
||||||
|
|
||||||
for (size_t j = 0; j < new_histogram.size(); ++j) {
|
for (size_t j = 0; j < new_histogram.size(); ++j) {
|
||||||
ASSERT_EQ(((Gradient)new_histogram[j]).GetGrad(),
|
ASSERT_EQ(((Gradient)new_histogram[j]).GetGrad(),
|
||||||
@ -50,7 +50,7 @@ void TestDeterminsticHistogram() {
|
|||||||
gpair.SetDevice(0);
|
gpair.SetDevice(0);
|
||||||
dh::device_vector<Gradient> baseline(kBins * kCols);
|
dh::device_vector<Gradient> baseline(kBins * kCols);
|
||||||
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
|
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
|
||||||
dh::ToSpan(baseline), rounding, true);
|
dh::ToSpan(baseline), rounding);
|
||||||
for (size_t i = 0; i < baseline.size(); ++i) {
|
for (size_t i = 0; i < baseline.size(); ++i) {
|
||||||
EXPECT_NEAR(((Gradient)baseline[i]).GetGrad(), ((Gradient)histogram[i]).GetGrad(),
|
EXPECT_NEAR(((Gradient)baseline[i]).GetGrad(), ((Gradient)histogram[i]).GetGrad(),
|
||||||
((Gradient)baseline[i]).GetGrad() * 1e-3);
|
((Gradient)baseline[i]).GetGrad() * 1e-3);
|
||||||
|
|||||||
@ -101,7 +101,6 @@ void TestBuildHist(bool use_shared_memory_histograms) {
|
|||||||
maker.hist.AllocateHistogram(0);
|
maker.hist.AllocateHistogram(0);
|
||||||
maker.gpair = gpair.DeviceSpan();
|
maker.gpair = gpair.DeviceSpan();
|
||||||
|
|
||||||
maker.use_shared_memory_histograms = use_shared_memory_histograms;
|
|
||||||
maker.BuildHist(0);
|
maker.BuildHist(0);
|
||||||
DeviceHistogram<GradientSumT> d_hist = maker.hist;
|
DeviceHistogram<GradientSumT> d_hist = maker.hist;
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user