finished histogram.cu

2023-03-09 21:28:37 +01:00 · 2023-03-09 21:28:37 +01:00 · 0ed5d3c849
commit 0ed5d3c849
parent f67e7de7ef
6 changed files with 69 additions and 2 deletions
--- a/src/common/bitfield.h
+++ b/src/common/bitfield.h
@ -13,10 +13,14 @@
 #include <string>
 #include <vector>

-#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
+#if defined(__CUDACC__)
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
 #include "device_helpers.cuh"
+#elif defined(__HIP_PLATFORM_AMD__)
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include "device_helpers.hip.h"
 #endif  // defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)

 #include "xgboost/span.h"
--- a/src/common/compressed_iterator.h
+++ b/src/common/compressed_iterator.h
@ -11,8 +11,10 @@

 #include "common.h"

-#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
+#if defined(__CUDACC__)
 #include "device_helpers.cuh"
+#elif defined(__HIP_PLATFORM_AMD__)
+#include "device_helpers.hip.h"
 #endif  // __CUDACC__ || __HIP_PLATFORM_AMD__

 namespace xgboost {
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@ -8,7 +8,13 @@
 #include <xgboost/data.h>

 #include "../common/compressed_iterator.h"
+
+#if defined(XGBOOST_USE_CUDA)
 #include "../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../common/device_helpers.hip.h"
+#endif
+
 #include "../common/hist_util.h"
 #include "../common/categorical.h"
 #include <thrust/binary_search.h>
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@ -9,7 +9,13 @@
 #include <limits>

 #include "../../common/deterministic.cuh"
+
+#if defined(XGBOOST_USE_CUDA)
 #include "../../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../../common/device_helpers.hip.h"
+#endif
+
 #include "../../data/ellpack_page.cuh"
 #include "histogram.cuh"
 #include "row_partitioner.cuh"
@ -59,8 +65,14 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {

  thrust::device_ptr<GradientPair const> gpair_beg{gpair.data()};
  auto beg = thrust::make_transform_iterator(gpair_beg, Clip());
+#if defined(XGBOOST_USE_CUDA)
  Pair p =
      dh::Reduce(thrust::cuda::par(alloc), beg, beg + gpair.size(), Pair{}, thrust::plus<Pair>{});
+#elif defined(XGBOOST_USE_HIP)
+  Pair p =
+      dh::Reduce(thrust::hip::par(alloc), beg, beg + gpair.size(), Pair{}, thrust::plus<Pair>{});
+#endif
+
  // Treat pair as array of 4 primitive types to allreduce
  using ReduceT = typename decltype(p.first)::ValueT;
  static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
@ -258,7 +270,13 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
                            bool force_global_memory) {
  // decide whether to use shared memory
  int device = 0;
+
+#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaGetDevice(&device));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipGetDevice(&device));
+#endif
+
  // opt into maximum shared memory for the kernel if necessary
  size_t max_shared_memory = dh::MaxSharedMemoryOptin(device);

@ -273,16 +291,28 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&

  auto runit = [&, kMinItemsPerBlock = kItemsPerTile](auto kernel) {
    if (shared) {
+#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                                         max_shared_memory));
+#elif defined(XGBOOST_USE_HIP)
+      dh::safe_cuda(hipFuncSetAttribute((const void *)kernel, hipFuncAttributeMaxDynamicSharedMemorySize,
+                                         max_shared_memory));
+#endif
    }

    // determine the launch configuration
    int num_groups = feature_groups.NumGroups();
    int n_mps = 0;
+
+#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
    int n_blocks_per_mp = 0;
    dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
+    int n_blocks_per_mp = 0;
+    dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
+#endif
                                                                kBlockThreads, smem_size));
    // This gives the number of blocks to keep the device occupied
    // Use this as the maximum number of blocks
@ -311,7 +341,11 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
    runit(SharedMemHistKernel<false, kBlockThreads, kItemsPerThread>);
  }

+#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaGetLastError());
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipGetLastError());
+#endif
 }

 }  // namespace tree
--- a/src/tree/gpu_hist/histogram.hip
+++ b/src/tree/gpu_hist/histogram.hip
@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "histogram.cu"
+#endif
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@ -7,7 +7,12 @@
 #include <limits>
 #include <vector>

+#if defined(XGBOOST_USE_CUDA)
 #include "../../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../../common/device_helpers.hip.h"
+#endif
+
 #include "xgboost/base.h"
 #include "xgboost/context.h"
 #include "xgboost/task.h"
@ -140,13 +145,25 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
      });
  size_t temp_bytes = 0;
  if (tmp->empty()) {
+#if defined(XGBOOST_USE_CUDA)
    cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
                                   IndexFlagOp(), total_rows, stream);
+#elif defined(XGBOOST_USE_HIP)
+    rocprim::inclusive_scan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
+                                   total_rows, IndexFlagOp(), stream);
+#endif
+
    tmp->resize(temp_bytes);
  }
  temp_bytes = tmp->size();
+
+#if defined(XGBOOST_USE_CUDA)
  cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator,
                                 discard_write_iterator, IndexFlagOp(), total_rows, stream);
+#elif defined(XGBOOST_USE_HIP)
+  rocprim::inclusive_scan(tmp->data().get(), temp_bytes, input_iterator, discard_write_iterator,
+                                   total_rows, IndexFlagOp(), stream);
+#endif

  constexpr int kBlockSize = 256;