enable coll and comm
This commit is contained in:
parent
b6b5218245
commit
02f5464fa6
4
src/collective/aggregator.hip.h
Normal file
4
src/collective/aggregator.hip.h
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "aggregator.cuh"
|
||||||
@ -10,7 +10,11 @@
|
|||||||
#include "allgather.h" // for AllgatherVOffset
|
#include "allgather.h" // for AllgatherVOffset
|
||||||
#include "coll.cuh"
|
#include "coll.cuh"
|
||||||
#include "comm.cuh"
|
#include "comm.cuh"
|
||||||
|
#if defined(XGBOOST_USE_NCCL)
|
||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
|
#elif defined(XGBOOST_USE_RCCL)
|
||||||
|
#include "rccl.h"
|
||||||
|
#endif
|
||||||
#include "xgboost/collective/result.h" // for Result
|
#include "xgboost/collective/result.h" // for Result
|
||||||
#include "xgboost/span.h" // for Span
|
#include "xgboost/span.h" // for Span
|
||||||
|
|
||||||
@ -29,7 +33,11 @@ Result GetNCCLResult(ncclResult_t code) {
|
|||||||
if (code == ncclUnhandledCudaError) {
|
if (code == ncclUnhandledCudaError) {
|
||||||
// nccl usually preserves the last error so we can get more details.
|
// nccl usually preserves the last error so we can get more details.
|
||||||
auto err = cudaPeekAtLastError();
|
auto err = cudaPeekAtLastError();
|
||||||
|
#if defined(XGBOOST_USE_NCCL)
|
||||||
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
|
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
|
||||||
|
#elif defined(XGBOOST_USE_RCCL)
|
||||||
|
ss << " CUDA error: " << thrust::system_error(err, thrust::hip_category()).what() << "\n";
|
||||||
|
#endif
|
||||||
} else if (code == ncclSystemError) {
|
} else if (code == ncclSystemError) {
|
||||||
ss << " This might be caused by a network configuration issue. Please consider specifying "
|
ss << " This might be caused by a network configuration issue. Please consider specifying "
|
||||||
"the network interface for NCCL via environment variables listed in its reference: "
|
"the network interface for NCCL via environment variables listed in its reference: "
|
||||||
|
|||||||
4
src/collective/coll.hip
Normal file
4
src/collective/coll.hip
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "coll.cu"
|
||||||
|
#endif
|
||||||
4
src/collective/coll.hip.h
Normal file
4
src/collective/coll.hip.h
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "coll.cuh"
|
||||||
@ -36,12 +36,22 @@ Result GetUniqueId(Comm const& comm, ncclUniqueId* pid) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
inline constexpr std::size_t kUuidLength =
|
inline constexpr std::size_t kUuidLength =
|
||||||
sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(std::uint64_t);
|
#if defined(XGBOOST_USE_CUDA)
|
||||||
|
sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(std::uint64_t);
|
||||||
|
#elif defined(XGBOOST_USE_HIP)
|
||||||
|
sizeof(hipUUID) / sizeof(uint64_t);
|
||||||
|
#endif
|
||||||
|
|
||||||
void GetCudaUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid, DeviceOrd device) {
|
void GetCudaUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid, DeviceOrd device) {
|
||||||
|
#if defined(XGBOOST_USE_CUDA)
|
||||||
cudaDeviceProp prob{};
|
cudaDeviceProp prob{};
|
||||||
dh::safe_cuda(cudaGetDeviceProperties(&prob, device.ordinal));
|
dh::safe_cuda(cudaGetDeviceProperties(&prob, device.ordinal));
|
||||||
std::memcpy(uuid.data(), static_cast<void*>(&(prob.uuid)), sizeof(prob.uuid));
|
std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
|
||||||
|
#elif defined(XGBOOST_USE_HIP)
|
||||||
|
hipUUID id;
|
||||||
|
hipDeviceGetUuid(&id, device.ordinal);
|
||||||
|
std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string PrintUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid) {
|
static std::string PrintUUID(xgboost::common::Span<std::uint64_t, kUuidLength> const& uuid) {
|
||||||
|
|||||||
@ -3,8 +3,11 @@
|
|||||||
*/
|
*/
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
|
#if defined(XGBOOST_USE_NCCL)
|
||||||
#include "nccl.h"
|
#include "nccl.h"
|
||||||
|
#elif defined(XGBOOST_USE_RCCL)
|
||||||
|
#include "../common/cuda_to_hip.h"
|
||||||
|
#include "rccl.h"
|
||||||
#endif // XGBOOST_USE_NCCL
|
#endif // XGBOOST_USE_NCCL
|
||||||
#include "../common/device_helpers.cuh"
|
#include "../common/device_helpers.cuh"
|
||||||
#include "coll.h"
|
#include "coll.h"
|
||||||
@ -17,7 +20,11 @@ inline Result GetCUDAResult(cudaError rc) {
|
|||||||
if (rc == cudaSuccess) {
|
if (rc == cudaSuccess) {
|
||||||
return Success();
|
return Success();
|
||||||
}
|
}
|
||||||
|
#if defined(XGBOOST_USE_NCCL)
|
||||||
std::string msg = thrust::system_error(rc, thrust::cuda_category()).what();
|
std::string msg = thrust::system_error(rc, thrust::cuda_category()).what();
|
||||||
|
#elif defined(XGBOOST_USE_RCCL)
|
||||||
|
std::string msg = thrust::system_error(rc, thrust::hip_category()).what();
|
||||||
|
#endif
|
||||||
return Fail(msg);
|
return Fail(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
4
src/collective/comm.hip
Normal file
4
src/collective/comm.hip
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "comm.cu"
|
||||||
|
#endif
|
||||||
4
src/collective/comm.hip.h
Normal file
4
src/collective/comm.hip.h
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "comm.cuh"
|
||||||
@ -6,7 +6,9 @@
|
|||||||
#if defined(XGBOOST_USE_HIP)
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
|
||||||
#define cudaSuccess hipSuccess
|
#define cudaSuccess hipSuccess
|
||||||
|
#define cudaError hipError_t
|
||||||
#define cudaGetLastError hipGetLastError
|
#define cudaGetLastError hipGetLastError
|
||||||
|
#define cudaPeekAtLastError hipPeekAtLastError
|
||||||
|
|
||||||
#define cudaStream_t hipStream_t
|
#define cudaStream_t hipStream_t
|
||||||
#define cudaStreamCreate hipStreamCreate
|
#define cudaStreamCreate hipStreamCreate
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2023, XGBoost Contributors
|
* Copyright 2023, XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#if defined(XGBOOST_USE_NCCL)
|
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
#include <thrust/device_vector.h> // for device_vector
|
#include <thrust/device_vector.h> // for device_vector
|
||||||
#include <thrust/equal.h> // for equal
|
#include <thrust/equal.h> // for equal
|
||||||
|
|||||||
4
tests/cpp/collective/test_allgather.hip
Normal file
4
tests/cpp/collective/test_allgather.hip
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "test_allgather.cu"
|
||||||
|
#endif
|
||||||
@ -1,7 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2023, XGBoost Contributors
|
* Copyright 2023, XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#if defined(XGBOOST_USE_NCCL)
|
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
#include <thrust/host_vector.h> // for host_vector
|
#include <thrust/host_vector.h> // for host_vector
|
||||||
|
|
||||||
|
|||||||
4
tests/cpp/collective/test_allreduce.hip
Normal file
4
tests/cpp/collective/test_allreduce.hip
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "test_allreduce.cu"
|
||||||
|
#endif
|
||||||
4
tests/cpp/common/test_transform_range.hip
Normal file
4
tests/cpp/common/test_transform_range.hip
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "test_transform_range.cu"
|
||||||
|
#endif
|
||||||
4
tests/cpp/gbm/test_gblinear.hip
Normal file
4
tests/cpp/gbm/test_gblinear.hip
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "test_gblinear.cu"
|
||||||
|
#endif
|
||||||
4
tests/cpp/gbm/test_gbtree.hip
Normal file
4
tests/cpp/gbm/test_gbtree.hip
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "test_gbtree.cu"
|
||||||
|
#endif
|
||||||
0
tests/cpp/test_context.hip
Normal file
0
tests/cpp/test_context.hip
Normal file
4
tests/cpp/tree/gpu_hist/test_expand_entry.hip
Normal file
4
tests/cpp/tree/gpu_hist/test_expand_entry.hip
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
#include "test_expand_entry.cu"
|
||||||
|
#endif
|
||||||
Loading…
x
Reference in New Issue
Block a user