enable coll and comm

This commit is contained in:
Hui Liu 2023-10-30 15:15:05 -07:00
parent b6b5218245
commit 02f5464fa6
18 changed files with 76 additions and 5 deletions

View File

@ -0,0 +1,4 @@
#pragma once
#include "aggregator.cuh"

View File

@ -10,7 +10,11 @@
#include "allgather.h" // for AllgatherVOffset #include "allgather.h" // for AllgatherVOffset
#include "coll.cuh" #include "coll.cuh"
#include "comm.cuh" #include "comm.cuh"
#if defined(XGBOOST_USE_NCCL)
#include "nccl.h" #include "nccl.h"
#elif defined(XGBOOST_USE_RCCL)
#include "rccl.h"
#endif
#include "xgboost/collective/result.h" // for Result #include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span #include "xgboost/span.h" // for Span
@ -29,7 +33,11 @@ Result GetNCCLResult(ncclResult_t code) {
if (code == ncclUnhandledCudaError) { if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details. // nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError(); auto err = cudaPeekAtLastError();
#if defined(XGBOOST_USE_NCCL)
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
#elif defined(XGBOOST_USE_RCCL)
ss << " CUDA error: " << thrust::system_error(err, thrust::hip_category()).what() << "\n";
#endif
} else if (code == ncclSystemError) { } else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying " ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: " "the network interface for NCCL via environment variables listed in its reference: "

4
src/collective/coll.hip Normal file
View File

@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "coll.cu"
#endif

View File

@ -0,0 +1,4 @@
#pragma once
#include "coll.cuh"

View File

@ -36,12 +36,22 @@ Result GetUniqueId(Comm const& comm, ncclUniqueId* pid) {
} }
inline constexpr std::size_t kUuidLength = inline constexpr std::size_t kUuidLength =
#if defined(XGBOOST_USE_CUDA)
sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(std::uint64_t); sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(std::uint64_t);
#elif defined(XGBOOST_USE_HIP)
sizeof(hipUUID) / sizeof(uint64_t);
#endif
void GetCudaUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid, DeviceOrd device) { void GetCudaUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid, DeviceOrd device) {
#if defined(XGBOOST_USE_CUDA)
cudaDeviceProp prob{}; cudaDeviceProp prob{};
dh::safe_cuda(cudaGetDeviceProperties(&prob, device.ordinal)); dh::safe_cuda(cudaGetDeviceProperties(&prob, device.ordinal));
std::memcpy(uuid.data(), static_cast<void*>(&(prob.uuid)), sizeof(prob.uuid)); std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
#elif defined(XGBOOST_USE_HIP)
hipUUID id;
hipDeviceGetUuid(&id, device.ordinal);
std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
#endif
} }
static std::string PrintUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid) { static std::string PrintUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid) {

View File

@ -3,8 +3,11 @@
*/ */
#pragma once #pragma once
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL) #if defined(XGBOOST_USE_NCCL)
#include "nccl.h" #include "nccl.h"
#elif defined(XGBOOST_USE_RCCL)
#include "../common/cuda_to_hip.h"
#include "rccl.h"
#endif // XGBOOST_USE_NCCL #endif // XGBOOST_USE_NCCL
#include "../common/device_helpers.cuh" #include "../common/device_helpers.cuh"
#include "coll.h" #include "coll.h"
@ -17,7 +20,11 @@ inline Result GetCUDAResult(cudaError rc) {
if (rc == cudaSuccess) { if (rc == cudaSuccess) {
return Success(); return Success();
} }
#if defined(XGBOOST_USE_NCCL)
std::string msg = thrust::system_error(rc, thrust::cuda_category()).what(); std::string msg = thrust::system_error(rc, thrust::cuda_category()).what();
#elif defined(XGBOOST_USE_RCCL)
std::string msg = thrust::system_error(rc, thrust::hip_category()).what();
#endif
return Fail(msg); return Fail(msg);
} }

4
src/collective/comm.hip Normal file
View File

@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "comm.cu"
#endif

View File

@ -0,0 +1,4 @@
#pragma once
#include "comm.cuh"

View File

@ -6,7 +6,9 @@
#if defined(XGBOOST_USE_HIP) #if defined(XGBOOST_USE_HIP)
#define cudaSuccess hipSuccess #define cudaSuccess hipSuccess
#define cudaError hipError_t
#define cudaGetLastError hipGetLastError #define cudaGetLastError hipGetLastError
#define cudaPeekAtLastError hipPeekAtLastError
#define cudaStream_t hipStream_t #define cudaStream_t hipStream_t
#define cudaStreamCreate hipStreamCreate #define cudaStreamCreate hipStreamCreate

View File

@ -1,7 +1,7 @@
/** /**
* Copyright 2023, XGBoost Contributors * Copyright 2023, XGBoost Contributors
*/ */
#if defined(XGBOOST_USE_NCCL) #if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thrust/device_vector.h> // for device_vector #include <thrust/device_vector.h> // for device_vector
#include <thrust/equal.h> // for equal #include <thrust/equal.h> // for equal

View File

@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "test_allgather.cu"
#endif

View File

@ -1,7 +1,7 @@
/** /**
* Copyright 2023, XGBoost Contributors * Copyright 2023, XGBoost Contributors
*/ */
#if defined(XGBOOST_USE_NCCL) #if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thrust/host_vector.h> // for host_vector #include <thrust/host_vector.h> // for host_vector

View File

@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "test_allreduce.cu"
#endif

View File

@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "test_transform_range.cu"
#endif

View File

@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "test_gblinear.cu"
#endif

View File

@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "test_gbtree.cu"
#endif

View File

View File

@ -0,0 +1,4 @@
#if defined(XGBOOST_USE_HIP)
#include "test_expand_entry.cu"
#endif