From 02f5464fa67ee4ed71d4534d96a5f9f03069cee8 Mon Sep 17 00:00:00 2001 From: Hui Liu <96135754+amdsc21@users.noreply.github.com> Date: Mon, 30 Oct 2023 15:15:05 -0700 Subject: [PATCH] enable coll and comm --- src/collective/aggregator.hip.h | 4 ++++ src/collective/coll.cu | 8 ++++++++ src/collective/coll.hip | 4 ++++ src/collective/coll.hip.h | 4 ++++ src/collective/comm.cu | 14 ++++++++++++-- src/collective/comm.cuh | 9 ++++++++- src/collective/comm.hip | 4 ++++ src/collective/comm.hip.h | 4 ++++ src/common/cuda_to_hip.h | 2 ++ tests/cpp/collective/test_allgather.cu | 2 +- tests/cpp/collective/test_allgather.hip | 4 ++++ tests/cpp/collective/test_allreduce.cu | 2 +- tests/cpp/collective/test_allreduce.hip | 4 ++++ tests/cpp/common/test_transform_range.hip | 4 ++++ tests/cpp/gbm/test_gblinear.hip | 4 ++++ tests/cpp/gbm/test_gbtree.hip | 4 ++++ tests/cpp/test_context.hip | 0 tests/cpp/tree/gpu_hist/test_expand_entry.hip | 4 ++++ 18 files changed, 76 insertions(+), 5 deletions(-) create mode 100644 src/collective/aggregator.hip.h create mode 100644 src/collective/coll.hip create mode 100644 src/collective/coll.hip.h create mode 100644 src/collective/comm.hip create mode 100644 src/collective/comm.hip.h create mode 100644 tests/cpp/collective/test_allgather.hip create mode 100644 tests/cpp/collective/test_allreduce.hip create mode 100644 tests/cpp/common/test_transform_range.hip create mode 100644 tests/cpp/gbm/test_gblinear.hip create mode 100644 tests/cpp/gbm/test_gbtree.hip create mode 100644 tests/cpp/test_context.hip create mode 100644 tests/cpp/tree/gpu_hist/test_expand_entry.hip diff --git a/src/collective/aggregator.hip.h b/src/collective/aggregator.hip.h new file mode 100644 index 000000000..fb8f3091a --- /dev/null +++ b/src/collective/aggregator.hip.h @@ -0,0 +1,4 @@ + +#pragma once + +#include "aggregator.cuh" diff --git a/src/collective/coll.cu b/src/collective/coll.cu index 9802dc096..6741a09b5 100644 --- a/src/collective/coll.cu +++ b/src/collective/coll.cu @@ -10,7 +10,11 @@ #include "allgather.h" // for AllgatherVOffset #include "coll.cuh" #include "comm.cuh" +#if defined(XGBOOST_USE_NCCL) #include "nccl.h" +#elif defined(XGBOOST_USE_RCCL) +#include "rccl.h" +#endif #include "xgboost/collective/result.h" // for Result #include "xgboost/span.h" // for Span @@ -29,7 +33,11 @@ Result GetNCCLResult(ncclResult_t code) { if (code == ncclUnhandledCudaError) { // nccl usually preserves the last error so we can get more details. auto err = cudaPeekAtLastError(); +#if defined(XGBOOST_USE_NCCL) ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; +#elif defined(XGBOOST_USE_RCCL) + ss << " CUDA error: " << thrust::system_error(err, thrust::hip_category()).what() << "\n"; +#endif } else if (code == ncclSystemError) { ss << " This might be caused by a network configuration issue. Please consider specifying " "the network interface for NCCL via environment variables listed in its reference: " diff --git a/src/collective/coll.hip b/src/collective/coll.hip new file mode 100644 index 000000000..8f3e09ac1 --- /dev/null +++ b/src/collective/coll.hip @@ -0,0 +1,4 @@ + +#if defined(XGBOOST_USE_HIP) +#include "coll.cu" +#endif diff --git a/src/collective/coll.hip.h b/src/collective/coll.hip.h new file mode 100644 index 000000000..619cfdae9 --- /dev/null +++ b/src/collective/coll.hip.h @@ -0,0 +1,4 @@ + +#pragma once + +#include "coll.cuh" diff --git a/src/collective/comm.cu b/src/collective/comm.cu index 2fff9e71b..07dfafbef 100644 --- a/src/collective/comm.cu +++ b/src/collective/comm.cu @@ -36,12 +36,22 @@ Result GetUniqueId(Comm const& comm, ncclUniqueId* pid) { } inline constexpr std::size_t kUuidLength = - sizeof(std::declval().uuid) / sizeof(std::uint64_t); +#if defined(XGBOOST_USE_CUDA) + sizeof(std::declval().uuid) / sizeof(std::uint64_t); +#elif defined(XGBOOST_USE_HIP) + sizeof(hipUUID) / sizeof(uint64_t); +#endif void GetCudaUUID(xgboost::common::Span const& uuid, DeviceOrd device) { +#if defined(XGBOOST_USE_CUDA) cudaDeviceProp prob{}; dh::safe_cuda(cudaGetDeviceProperties(&prob, device.ordinal)); - std::memcpy(uuid.data(), static_cast(&(prob.uuid)), sizeof(prob.uuid)); + std::memcpy(uuid.data(), static_cast(&(prob.uuid)), sizeof(prob.uuid)); +#elif defined(XGBOOST_USE_HIP) + hipUUID id; + hipDeviceGetUuid(&id, device.ordinal); + std::memcpy(uuid.data(), static_cast(&id), sizeof(id)); +#endif } static std::string PrintUUID(xgboost::common::Span const& uuid) { diff --git a/src/collective/comm.cuh b/src/collective/comm.cuh index 559e4ad01..1439bafbb 100644 --- a/src/collective/comm.cuh +++ b/src/collective/comm.cuh @@ -3,8 +3,11 @@ */ #pragma once -#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL) +#if defined(XGBOOST_USE_NCCL) #include "nccl.h" +#elif defined(XGBOOST_USE_RCCL) +#include "../common/cuda_to_hip.h" +#include "rccl.h" #endif // XGBOOST_USE_NCCL #include "../common/device_helpers.cuh" #include "coll.h" @@ -17,7 +20,11 @@ inline Result GetCUDAResult(cudaError rc) { if (rc == cudaSuccess) { return Success(); } +#if defined(XGBOOST_USE_NCCL) std::string msg = thrust::system_error(rc, thrust::cuda_category()).what(); +#elif defined(XGBOOST_USE_RCCL) + std::string msg = thrust::system_error(rc, thrust::hip_category()).what(); +#endif return Fail(msg); } diff --git a/src/collective/comm.hip b/src/collective/comm.hip new file mode 100644 index 000000000..e8619d41f --- /dev/null +++ b/src/collective/comm.hip @@ -0,0 +1,4 @@ + +#if defined(XGBOOST_USE_HIP) +#include "comm.cu" +#endif diff --git a/src/collective/comm.hip.h b/src/collective/comm.hip.h new file mode 100644 index 000000000..4fee44302 --- /dev/null +++ b/src/collective/comm.hip.h @@ -0,0 +1,4 @@ + +#pragma once + +#include "comm.cuh" diff --git a/src/common/cuda_to_hip.h b/src/common/cuda_to_hip.h index f56cb60a8..08042750a 100644 --- a/src/common/cuda_to_hip.h +++ b/src/common/cuda_to_hip.h @@ -6,7 +6,9 @@ #if defined(XGBOOST_USE_HIP) #define cudaSuccess hipSuccess +#define cudaError hipError_t #define cudaGetLastError hipGetLastError +#define cudaPeekAtLastError hipPeekAtLastError #define cudaStream_t hipStream_t #define cudaStreamCreate hipStreamCreate diff --git a/tests/cpp/collective/test_allgather.cu b/tests/cpp/collective/test_allgather.cu index 48f7c2615..a997b2324 100644 --- a/tests/cpp/collective/test_allgather.cu +++ b/tests/cpp/collective/test_allgather.cu @@ -1,7 +1,7 @@ /** * Copyright 2023, XGBoost Contributors */ -#if defined(XGBOOST_USE_NCCL) +#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL) #include #include // for device_vector #include // for equal diff --git a/tests/cpp/collective/test_allgather.hip b/tests/cpp/collective/test_allgather.hip new file mode 100644 index 000000000..d9d159c8e --- /dev/null +++ b/tests/cpp/collective/test_allgather.hip @@ -0,0 +1,4 @@ + +#if defined(XGBOOST_USE_HIP) +#include "test_allgather.cu" +#endif diff --git a/tests/cpp/collective/test_allreduce.cu b/tests/cpp/collective/test_allreduce.cu index af9a4e58f..c2bd7dd63 100644 --- a/tests/cpp/collective/test_allreduce.cu +++ b/tests/cpp/collective/test_allreduce.cu @@ -1,7 +1,7 @@ /** * Copyright 2023, XGBoost Contributors */ -#if defined(XGBOOST_USE_NCCL) +#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL) #include #include // for host_vector diff --git a/tests/cpp/collective/test_allreduce.hip b/tests/cpp/collective/test_allreduce.hip new file mode 100644 index 000000000..60603aa9f --- /dev/null +++ b/tests/cpp/collective/test_allreduce.hip @@ -0,0 +1,4 @@ + +#if defined(XGBOOST_USE_HIP) +#include "test_allreduce.cu" +#endif diff --git a/tests/cpp/common/test_transform_range.hip b/tests/cpp/common/test_transform_range.hip new file mode 100644 index 000000000..7c219a273 --- /dev/null +++ b/tests/cpp/common/test_transform_range.hip @@ -0,0 +1,4 @@ + +#if defined(XGBOOST_USE_HIP) +#include "test_transform_range.cu" +#endif diff --git a/tests/cpp/gbm/test_gblinear.hip b/tests/cpp/gbm/test_gblinear.hip new file mode 100644 index 000000000..88ad10d45 --- /dev/null +++ b/tests/cpp/gbm/test_gblinear.hip @@ -0,0 +1,4 @@ + +#if defined(XGBOOST_USE_HIP) +#include "test_gblinear.cu" +#endif diff --git a/tests/cpp/gbm/test_gbtree.hip b/tests/cpp/gbm/test_gbtree.hip new file mode 100644 index 000000000..1b21f4804 --- /dev/null +++ b/tests/cpp/gbm/test_gbtree.hip @@ -0,0 +1,4 @@ + +#if defined(XGBOOST_USE_HIP) +#include "test_gbtree.cu" +#endif diff --git a/tests/cpp/test_context.hip b/tests/cpp/test_context.hip new file mode 100644 index 000000000..e69de29bb diff --git a/tests/cpp/tree/gpu_hist/test_expand_entry.hip b/tests/cpp/tree/gpu_hist/test_expand_entry.hip new file mode 100644 index 000000000..fe5fdee88 --- /dev/null +++ b/tests/cpp/tree/gpu_hist/test_expand_entry.hip @@ -0,0 +1,4 @@ + +#if defined(XGBOOST_USE_HIP) +#include "test_expand_entry.cu" +#endif