finished histogram.cu

This commit is contained in:
amdsc21 2023-03-09 21:28:37 +01:00
parent f67e7de7ef
commit 0ed5d3c849
6 changed files with 69 additions and 2 deletions

View File

@ -13,10 +13,14 @@
#include <string>
#include <vector>
#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
#if defined(__CUDACC__)
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include "device_helpers.cuh"
#elif defined(__HIP_PLATFORM_AMD__)
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include "device_helpers.hip.h"
#endif // defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
#include "xgboost/span.h"

View File

@ -11,8 +11,10 @@
#include "common.h"
#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
#if defined(__CUDACC__)
#include "device_helpers.cuh"
#elif defined(__HIP_PLATFORM_AMD__)
#include "device_helpers.hip.h"
#endif // __CUDACC__ || __HIP_PLATFORM_AMD__
namespace xgboost {

View File

@ -8,7 +8,13 @@
#include <xgboost/data.h>
#include "../common/compressed_iterator.h"
#if defined(XGBOOST_USE_CUDA)
#include "../common/device_helpers.cuh"
#elif defined(XGBOOST_USE_HIP)
#include "../common/device_helpers.hip.h"
#endif
#include "../common/hist_util.h"
#include "../common/categorical.h"
#include <thrust/binary_search.h>

View File

@ -9,7 +9,13 @@
#include <limits>
#include "../../common/deterministic.cuh"
#if defined(XGBOOST_USE_CUDA)
#include "../../common/device_helpers.cuh"
#elif defined(XGBOOST_USE_HIP)
#include "../../common/device_helpers.hip.h"
#endif
#include "../../data/ellpack_page.cuh"
#include "histogram.cuh"
#include "row_partitioner.cuh"
@ -59,8 +65,14 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
thrust::device_ptr<GradientPair const> gpair_beg{gpair.data()};
auto beg = thrust::make_transform_iterator(gpair_beg, Clip());
#if defined(XGBOOST_USE_CUDA)
Pair p =
dh::Reduce(thrust::cuda::par(alloc), beg, beg + gpair.size(), Pair{}, thrust::plus<Pair>{});
#elif defined(XGBOOST_USE_HIP)
Pair p =
dh::Reduce(thrust::hip::par(alloc), beg, beg + gpair.size(), Pair{}, thrust::plus<Pair>{});
#endif
// Treat pair as array of 4 primitive types to allreduce
using ReduceT = typename decltype(p.first)::ValueT;
static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
@ -258,7 +270,13 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
bool force_global_memory) {
// decide whether to use shared memory
int device = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device));
#endif
// opt into maximum shared memory for the kernel if necessary
size_t max_shared_memory = dh::MaxSharedMemoryOptin(device);
@ -273,16 +291,28 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
auto runit = [&, kMinItemsPerBlock = kItemsPerTile](auto kernel) {
if (shared) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
max_shared_memory));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipFuncSetAttribute((const void *)kernel, hipFuncAttributeMaxDynamicSharedMemorySize,
max_shared_memory));
#endif
}
// determine the launch configuration
int num_groups = feature_groups.NumGroups();
int n_mps = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
int n_blocks_per_mp = 0;
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
int n_blocks_per_mp = 0;
dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
#endif
kBlockThreads, smem_size));
// This gives the number of blocks to keep the device occupied
// Use this as the maximum number of blocks
@ -311,7 +341,11 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
runit(SharedMemHistKernel<false, kBlockThreads, kItemsPerThread>);
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetLastError());
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetLastError());
#endif
}
} // namespace tree

View File

@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "histogram.cu"
#endif

View File

@ -7,7 +7,12 @@
#include <limits>
#include <vector>
#if defined(XGBOOST_USE_CUDA)
#include "../../common/device_helpers.cuh"
#elif defined(XGBOOST_USE_HIP)
#include "../../common/device_helpers.hip.h"
#endif
#include "xgboost/base.h"
#include "xgboost/context.h"
#include "xgboost/task.h"
@ -140,13 +145,25 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
});
size_t temp_bytes = 0;
if (tmp->empty()) {
#if defined(XGBOOST_USE_CUDA)
cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
IndexFlagOp(), total_rows, stream);
#elif defined(XGBOOST_USE_HIP)
rocprim::inclusive_scan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
total_rows, IndexFlagOp(), stream);
#endif
tmp->resize(temp_bytes);
}
temp_bytes = tmp->size();
#if defined(XGBOOST_USE_CUDA)
cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator,
discard_write_iterator, IndexFlagOp(), total_rows, stream);
#elif defined(XGBOOST_USE_HIP)
rocprim::inclusive_scan(tmp->data().get(), temp_bytes, input_iterator, discard_write_iterator,
total_rows, IndexFlagOp(), stream);
#endif
constexpr int kBlockSize = 256;