Use cudaOccupancyMaxPotentialBlockSize to calculate the block size. (#5926)

This commit is contained in:
Jiaming Yuan 2020-07-23 14:24:42 +08:00 committed by GitHub
parent fbfbd525d8
commit a4de2f68e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -175,7 +175,11 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
} }
// determine the launch configuration // determine the launch configuration
unsigned block_threads = shared ? 1024 : 256; int min_grid_size;
int block_threads = 1024;
dh::safe_cuda(cudaOccupancyMaxPotentialBlockSize(
&min_grid_size, &block_threads, kernel, smem_size, 0));
int num_groups = feature_groups.NumGroups(); int num_groups = feature_groups.NumGroups();
int n_mps = 0; int n_mps = 0;
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
@@ -199,7 +203,8 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
grid_size = common::DivRoundUp(grid_size, grid_size = common::DivRoundUp(grid_size,
common::DivRoundUp(num_groups, num_groups_threshold)); common::DivRoundUp(num_groups, num_groups_threshold));
dh::LaunchKernel {dim3(grid_size, num_groups), block_threads, smem_size} ( dh::LaunchKernel {
dim3(grid_size, num_groups), static_cast<uint32_t>(block_threads), smem_size} (
kernel, kernel,
matrix, feature_groups, d_ridx, histogram.data(), gpair.data(), rounding, matrix, feature_groups, d_ridx, histogram.data(), gpair.data(), rounding,
shared); shared);