Add CUDA-to-HIP wrapper, removing duplicated `#if XGBOOST_USE_CUDA / #elif XGBOOST_USE_HIP` branches

This commit is contained in:
Your Name
2023-10-17 12:42:37 -07:00
parent ea19555474
commit ffbbc9c968
35 changed files with 60 additions and 509 deletions

View File

@@ -28,11 +28,7 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
// default per-thread stream
default: {
dh::CUDAEvent e;
#if defined(XGBOOST_USE_CUDA)
e.Record(dh::CUDAStreamView{reinterpret_cast<cudaStream_t>(stream)});
#elif defined(XGBOOST_USE_HIP)
e.Record(dh::CUDAStreamView{reinterpret_cast<hipStream_t>(stream)});
#endif
dh::DefaultStream().Wait(e);
}
}

View File

@@ -22,19 +22,11 @@ namespace cub = hipcub;
namespace xgboost {
namespace {
auto SetDeviceToPtr(void const* ptr) {
#if defined(XGBOOST_USE_CUDA)
cudaPointerAttributes attr;
dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
int32_t ptr_device = attr.device;
dh::safe_cuda(cudaSetDevice(ptr_device));
return ptr_device;
#elif defined(XGBOOST_USE_HIP) /* this is wrong, need to figure out */
hipPointerAttribute_t attr;
dh::safe_cuda(hipPointerGetAttributes(&attr, ptr));
int32_t ptr_device = attr.device;
dh::safe_cuda(hipSetDevice(ptr_device));
return ptr_device;
#endif
}
template <typename T, int32_t D>
@@ -57,13 +49,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
// set data
data->Resize(array.n);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
cudaMemcpyDefault, ctx->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
hipMemcpyDefault, ctx->Stream()));
#endif
});
return;
}
@@ -114,13 +101,8 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
});
bool non_dec = true;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(&non_dec, flag.data().get(), sizeof(bool),
hipMemcpyDeviceToHost));
#endif
CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";
size_t bytes = 0;

View File

@@ -123,11 +123,7 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
CHECK_NE(device_idx_, Context::kCpuId);
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#endif
for (auto& json_col : json_columns) {
auto column = ArrayInterface<1>(get<Object const>(json_col));
@@ -216,18 +212,10 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
float missing) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#endif
IsValidFunctor is_valid(missing);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#endif
auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();

View File

@@ -107,11 +107,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
n_rows(n_rows) {
monitor_.Init("ellpack_page");
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
monitor_.Start("InitCompressedData");
InitCompressedData(device);
@@ -132,11 +128,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
: is_dense(dmat->IsDense()) {
monitor_.Init("ellpack_page");
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
#endif
n_rows = dmat->Info().num_row_;
@@ -330,11 +322,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
CopyDataToEllpack(batch, feature_types, this, device, missing);
@@ -409,13 +397,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
auto d_row_ptr = dh::ToSpan(row_ptr);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
hipMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#endif
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
auto null = accessor.NullValue();
@@ -570,27 +553,15 @@ void EllpackPageImpl::CreateHistIndices(int device,
if (row_batch.data.DeviceCanRead()) {
auto const& d_data = row_batch.data.ConstDeviceSpan();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
entries_d.data().get(), d_data.data() + ent_cnt_begin,
n_entries * sizeof(Entry), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
entries_d.data().get(), d_data.data() + ent_cnt_begin,
n_entries * sizeof(Entry), hipMemcpyDefault));
#endif
} else {
const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
entries_d.data().get(), data_vec.data() + ent_cnt_begin,
n_entries * sizeof(Entry), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
entries_d.data().get(), data_vec.data() + ent_cnt_begin,
n_entries * sizeof(Entry), hipMemcpyDefault));
#endif
}
const dim3 block3(32, 8, 1); // 256 threads

View File

@@ -10,11 +10,7 @@
namespace xgboost::data {
void EllpackPageSource::Fetch() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
if (!this->ReadCache()) {
if (count_ != 0 && !sync_) {
// source is initialized to be the 0th page during construction, so when count_ is 0

View File

@@ -47,11 +47,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
int32_t current_device;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&current_device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&current_device));
#endif
auto get_device = [&]() -> int32_t {
std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
@@ -68,11 +64,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
// ctx_.gpu_id = proxy->DeviceIdx();
CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif
if (cols == 0) {
cols = num_cols();
@@ -111,11 +103,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto n_features = cols;
CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif
if (!ref) {
HostDeviceVector<FeatureType> ft;
@@ -156,11 +144,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
while (iter.Next()) {
init_page();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(get_device()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(get_device()));
#endif
auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0);

View File

@@ -25,11 +25,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
: adapter->DeviceIdx();
CHECK_GE(device, 0);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});

View File

@@ -57,11 +57,7 @@ template <typename AdapterBatchT>
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
int device_idx, float missing) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx));
#endif
IsValidFunctor is_valid(missing);
// Count elements per row