diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index b3486ec5e..ca5c522e1 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -200,6 +200,18 @@ macro(xgboost_link_nccl target)
   endif()
 endmacro()
 
+macro(xgboost_link_rccl target)
+  if(BUILD_STATIC_LIB)
+    target_include_directories(${target} PUBLIC ${rccl_INCLUDE_DIR})
+    target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_RCCL=1)
+    target_link_libraries(${target} PUBLIC ${rccl_LIBRARY})
+  else()
+    target_include_directories(${target} PRIVATE ${rccl_INCLUDE_DIR})
+    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_RCCL=1)
+    target_link_libraries(${target} PRIVATE ${rccl_LIBRARY})
+  endif()
+endmacro()
+
 # compile options
 macro(xgboost_target_properties target)
   set_target_properties(${target} PROPERTIES
@@ -302,6 +314,10 @@ macro(xgboost_target_link_libraries target)
     xgboost_link_nccl(${target})
   endif()
 
+  if(USE_RCCL)
+    xgboost_link_rccl(${target})
+  endif()
+
   if(USE_NVTX)
     target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
   endif()
diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh
index 15300a6e2..b1e903821 100644
--- a/src/collective/nccl_device_communicator.cuh
+++ b/src/collective/nccl_device_communicator.cuh
@@ -37,21 +37,21 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
  private:
   static constexpr std::size_t kUuidLength =
 #if defined(XGBOOST_USE_HIP)
-      sizeof(std::declval<hipDeviceProp>().uuid) / sizeof(uint64_t);
-#else
+      sizeof(hipUUID) / sizeof(uint64_t);
+#elif defined(XGBOOST_USE_CUDA)
       sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
 #endif
 
   void GetCudaUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) const {
 #if defined(XGBOOST_USE_HIP)
-    hipDeviceProp prob{};
-    dh::safe_cuda(hipGetDeviceProperties(&prob, device_ordinal_));
-#else
+    hipUUID id;
+    hipDeviceGetUuid(&id, device_ordinal_);
+    std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
+#elif defined(XGBOOST_USE_CUDA)
     cudaDeviceProp prob{};
     dh::safe_cuda(cudaGetDeviceProperties(&prob, device_ordinal_));
-#endif
-    std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
+    std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
+#endif
   }
 
   static std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) {
diff --git a/src/common/bitfield.h b/src/common/bitfield.h
index 511769e63..8dbc7ed66 100644
--- a/src/common/bitfield.h
+++ b/src/common/bitfield.h
@@ -162,6 +162,16 @@ struct BitFieldContainer {
     using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
     atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
   }
+
+  /* compiler hack */
+#if defined(__HIP_PLATFORM_AMD__)
+  void Clear(index_type pos) noexcept(true) {
+    Pos pos_v = Direction::Shift(ToBitPos(pos));
+    value_type& value = Data()[pos_v.int_pos];
+    value_type clear_bit = ~(kOne << pos_v.bit_pos);
+    value &= clear_bit;
+  }
+#endif
 #else
   void Set(index_type pos) noexcept(true) {
     Pos pos_v = Direction::Shift(ToBitPos(pos));
diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h
index 38784ca9e..cee6c405c 100644
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -173,7 +173,7 @@ class ColumnMatrix {
     this->InitView();
   }
   /** @brief Set the i^th element to be a valid element (instead of missing). */
-  void SetValid(typename LBitField32::index_type i) { /*missing.Clear(i); */}
+  void SetValid(typename LBitField32::index_type i) { missing.Clear(i); }
   /** @brief assign the storage to the view. */
   void InitView() {
     missing = LBitField32{Span{storage.data(), storage.size()}};
diff --git a/src/common/device_helpers.hip.h b/src/common/device_helpers.hip.h
index 2852155d4..437d35bc6 100644
--- a/src/common/device_helpers.hip.h
+++ b/src/common/device_helpers.hip.h
@@ -109,7 +109,7 @@ inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int li
   if (code == ncclUnhandledCudaError) {
     // nccl usually preserves the last error so we can get more details.
     auto err = hipPeekAtLastError();
-    ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
+    ss << " CUDA error: " << thrust::system_error(err, thrust::hip_category()).what() << "\n";
   } else if (code == ncclSystemError) {
     ss << " This might be caused by a network configuration issue. Please consider specifying "
           "the network interface for RCCL via environment variables listed in its reference: "
diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index 53dbc37a1..15aebe609 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -328,7 +328,7 @@ template <>
 struct ToDType<__half> {
   static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2;
 };
-#endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
+#endif  // defined(XGBOOST_USE_CUDA)
 template <>
 struct ToDType<float> {
   static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4;
 };
@@ -377,10 +377,10 @@ struct ToDType<int64_t> {
   static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI8;
 };
 
-#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
+#if !defined(XGBOOST_USE_CUDA) && !defined(__HIP_PLATFORM_AMD__)
 inline void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); }
 inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
-#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
+#endif  // !defined(XGBOOST_USE_CUDA)
 
 /**
  * \brief A type erased view over __array_interface__ protocol defined by numpy
@@ -482,7 +482,7 @@ class ArrayInterface {
       type = T::kF2;
 #else
       LOG(FATAL) << "Half type is not supported.";
-#endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
+#endif  // defined(XGBOOST_USE_CUDA)
     } else if (typestr[1] == 'f' && typestr[2] == '4') {
       type = T::kF4;
     } else if (typestr[1] == 'f' && typestr[2] == '8') {
@@ -519,7 +519,7 @@ class ArrayInterface {
       case T::kF2: {
 #if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
         return func(reinterpret_cast<__half const *>(data));
-#endif  // defined(XGBOOST_USE_CUDA) || || defined(__HIP_PLATFORM_AMD__)
+#endif  // defined(XGBOOST_USE_CUDA)
       }
       case T::kF4:
         return func(reinterpret_cast<float const *>(data));
@@ -582,7 +582,7 @@
       return static_cast<T>(static_cast<Type>(p_values[offset]));
 #else
       return static_cast<T>(p_values[offset]);
-#endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
+#endif  // defined(XGBOOST_USE_CUDA)
     });
   }
diff --git a/src/learner.cc b/src/learner.cc
index 5d7c85dd6..8ee901482 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -1478,11 +1478,11 @@ class LearnerImpl : public LearnerIO {
  private:
   void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info,
                    std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) {
-#if defined(XGBOOST_USE_CUDA)
+#ifndef XGBOOST_USE_HIP
     out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
     collective::ApplyWithLabels(info, out_gpair->Data(),
                                 [&] { obj_->GetGradient(preds, info, iter, out_gpair); });
-#elif defined(XGBOOST_USE_HIP)
+#else
     if (info.IsVerticalFederated()) {
       out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
       collective::ApplyWithLabels(info, out_gpair->Data(),
diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu
index 1402dee37..c908b3846 100644
--- a/tests/cpp/collective/test_nccl_device_communicator.cu
+++ b/tests/cpp/collective/test_nccl_device_communicator.cu
@@ -15,6 +15,7 @@
 #include "../../../src/collective/communicator-inl.hip.h"
 #include "../../../src/collective/nccl_device_communicator.hip.h"
 #endif
+#include "../helpers.h"
 
 namespace xgboost {
 namespace collective {
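
Note (not part of the patch): the nccl_device_communicator.cuh hunk above replaces the hipGetDeviceProperties()-based UUID lookup with hipDeviceGetUuid(). Below is a minimal standalone sketch of that call, assuming a HIP runtime with at least one visible device; device 0 is used for illustration where the communicator passes device_ordinal_.

    // Sketch only: obtain a device UUID via the HIP runtime API and print it.
    #include <hip/hip_runtime.h>

    #include <cstdio>
    #include <cstring>

    int main() {
      hipUUID id{};
      if (hipDeviceGetUuid(&id, 0) != hipSuccess) {
        std::fprintf(stderr, "hipDeviceGetUuid failed\n");
        return 1;
      }
      // Copy the opaque UUID bytes out, the same way the patch memcpy()s them
      // into the communicator's uuid span, then print them as hex.
      unsigned char bytes[sizeof(hipUUID)];
      std::memcpy(bytes, &id, sizeof(id));
      for (unsigned char b : bytes) {
        std::printf("%02x", static_cast<unsigned>(b));
      }
      std::printf("\n");
      return 0;
    }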