From 185dbce21f90d9f8d4a8abd2a06e165486468b50 Mon Sep 17 00:00:00 2001
From: amdsc21 <96135754+amdsc21@users.noreply.github.com>
Date: Fri, 10 Mar 2023 04:26:09 +0100
Subject: [PATCH] finish ellpack_page.cu

---
 src/data/ellpack_page.cc  | 36 ++++++++++++++++++++++++++++++++++--
 src/data/ellpack_page.cu  | 36 +++++++++++++++++++++++++++++++-----
 src/data/ellpack_page.hip |  4 ++++
 3 files changed, 69 insertions(+), 7 deletions(-)
diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc
index b1f24506e..e3df86945 100644
--- a/src/data/ellpack_page.cc
+++ b/src/data/ellpack_page.cc
@@ -1,7 +1,7 @@
 /*!
  * Copyright 2019 XGBoost contributors
  */
-#ifndef XGBOOST_USE_CUDA
+#if !defined(XGBOOST_USE_CUDA)
 
 #include <xgboost/data.h>
 
@@ -34,4 +34,36 @@ size_t EllpackPage::Size() const {
 
 }  // namespace xgboost
 
-#endif  // XGBOOST_USE_CUDA
+#elif !defined(XGBOOST_USE_HIP)
+
+#include <xgboost/data.h>
+
+// dummy implementation of EllpackPage in case HIP is not used
+namespace xgboost {
+
+class EllpackPageImpl {};
+
+EllpackPage::EllpackPage() = default;
+
+EllpackPage::EllpackPage(DMatrix*, const BatchParam&) {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with HIP but "
+                "EllpackPage is required";
+}
+
+EllpackPage::~EllpackPage() {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with HIP but "
+                "EllpackPage is required";
+}
+
+void EllpackPage::SetBaseRowId(std::size_t) {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with HIP but "
+                "EllpackPage is required";
+}
+size_t EllpackPage::Size() const {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with HIP but "
+                "EllpackPage is required";
+  return 0;
+}
+
+}  // namespace xgboost
+#endif  // XGBOOST_USE_CUDA || XGBOOST_USE_HIP
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index ed84d532f..fc46df4a7 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -13,7 +13,7 @@
 #include "gradient_index.h"
 #include "xgboost/data.h"
 
-#if defined(__HIP_PLATFORM_AMD__)
+#if defined(XGBOOST_USE_HIP)
 #include <rocprim/rocprim.hpp>
 #endif
 
@@ -91,7 +91,12 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
       row_stride(row_stride),
       n_rows(n_rows) {
   monitor_.Init("ellpack_page");
+
+#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipSetDevice(device));
+#endif
 
   monitor_.Start("InitCompressedData");
   InitCompressedData(device);
@@ -112,7 +117,12 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
 EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
     : is_dense(dmat->IsDense()) {
   monitor_.Init("ellpack_page");
+
+#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(param.gpu_id));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipSetDevice(param.gpu_id));
+#endif
 
   n_rows = dmat->Info().num_row_;
 
@@ -266,13 +276,11 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
 
 #elif defined (__HIP_PLATFORM_AMD__)
 
-  rocprim::inclusive_scan<decltype(key_value_index_iter), decltype(out), TupleScanOp<Tuple>>
-      (nullptr, temp_storage_bytes, key_value_index_iter, out, batch.Size(), TupleScanOp<Tuple>());
+  rocprim::inclusive_scan(nullptr, temp_storage_bytes, key_value_index_iter, out, batch.Size(), TupleScanOp<Tuple>());
 
   dh::TemporaryArray<char> temp_storage(temp_storage_bytes);
 
-  rocprim::inclusive_scan<decltype(key_value_index_iter), decltype(out), TupleScanOp<Tuple>>
-      (temp_storage.data().get(), temp_storage_bytes, key_value_index_iter, out, batch.Size(),
+  rocprim::inclusive_scan(temp_storage.data().get(), temp_storage_bytes, key_value_index_iter, out, batch.Size(),
        TupleScanOp<Tuple>());
 
 #endif
@@ -302,7 +310,11 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
                                  common::Span<size_t> row_counts_span,
                                  common::Span<FeatureType const> feature_types, size_t row_stride,
                                  size_t n_rows, common::HistogramCuts const& cuts) {
+#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipSetDevice(device));
+#endif
 
   *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
   CopyDataToEllpack(batch, feature_types, this, device, missing);
@@ -529,14 +541,28 @@ void EllpackPageImpl::CreateHistIndices(int device,
     // copy data entries to device.
     if (row_batch.data.DeviceCanRead()) {
       auto const& d_data = row_batch.data.ConstDeviceSpan();
+
+#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(
           entries_d.data().get(), d_data.data() + ent_cnt_begin,
           n_entries * sizeof(Entry), cudaMemcpyDefault));
+#elif defined(XGBOOST_USE_HIP)
+      dh::safe_cuda(hipMemcpyAsync(
+          entries_d.data().get(), d_data.data() + ent_cnt_begin,
+          n_entries * sizeof(Entry), hipMemcpyDefault));
+#endif
     } else {
       const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
+
+#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(
           entries_d.data().get(), data_vec.data() + ent_cnt_begin,
           n_entries * sizeof(Entry), cudaMemcpyDefault));
+#elif defined(XGBOOST_USE_HIP)
+      dh::safe_cuda(hipMemcpyAsync(
+          entries_d.data().get(), data_vec.data() + ent_cnt_begin,
+          n_entries * sizeof(Entry), hipMemcpyDefault));
+#endif
     }
 
     const dim3 block3(32, 8, 1);  // 256 threads
diff --git a/src/data/ellpack_page.hip b/src/data/ellpack_page.hip
index e69de29bb..697e9a021 100644
--- a/src/data/ellpack_page.hip
+++ b/src/data/ellpack_page.hip
@@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "ellpack_page.cu"
+#endif