tune grid size

This commit is contained in:
amdsc21 2023-03-26 17:45:19 +02:00
parent 7ee4734d3a
commit 8c77e936d1

View File

@@ -325,8 +325,13 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
// Allocate number of blocks such that each block has about kMinItemsPerBlock work
// Up to a maximum where the device is saturated
#if defined(XGBOOST_USE_CUDA)
grid_size = std::min(grid_size, static_cast<std::uint32_t>( grid_size = std::min(grid_size, static_cast<std::uint32_t>(
common::DivRoundUp(items_per_group, kMinItemsPerBlock))); common::DivRoundUp(items_per_group, kMinItemsPerBlock)));
#elif defined(XGBOOST_USE_HIP)
grid_size = std::min(common::DivRoundUp(grid_size, num_groups), static_cast<std::uint32_t>(
common::DivRoundUp(items_per_group, kMinItemsPerBlock)));
#endif
dh::LaunchKernel {dim3(grid_size, num_groups), static_cast<uint32_t>(kBlockThreads), smem_size, dh::LaunchKernel {dim3(grid_size, num_groups), static_cast<uint32_t>(kBlockThreads), smem_size,
ctx->Stream()} (kernel, matrix, feature_groups, d_ridx, histogram.data(), ctx->Stream()} (kernel, matrix, feature_groups, d_ridx, histogram.data(),