fix uuid and Clear/SetValid

This commit is contained in:
Hui Liu 2023-10-23 16:32:26 -07:00
parent 55994b1ac7
commit 6ba66463b6
8 changed files with 44 additions and 17 deletions

View File

@ -200,6 +200,18 @@ macro(xgboost_link_nccl target)
endif() endif()
endmacro() endmacro()
macro(xgboost_link_rccl target)
if(BUILD_STATIC_LIB)
target_include_directories(${target} PUBLIC ${rccl_INCLUDE_DIR})
target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_RCCL=1)
target_link_libraries(${target} PUBLIC ${rccl_LIBRARY})
else()
target_include_directories(${target} PRIVATE ${rccl_INCLUDE_DIR})
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_RCCL=1)
target_link_libraries(${target} PRIVATE ${rccl_LIBRARY})
endif()
endmacro()
# compile options # compile options
macro(xgboost_target_properties target) macro(xgboost_target_properties target)
set_target_properties(${target} PROPERTIES set_target_properties(${target} PROPERTIES
@ -302,6 +314,10 @@ macro(xgboost_target_link_libraries target)
xgboost_link_nccl(${target}) xgboost_link_nccl(${target})
endif() endif()
if(USE_RCCL)
xgboost_link_rccl(${target})
endif()
if(USE_NVTX) if(USE_NVTX)
target_link_libraries(${target} PRIVATE CUDA::nvToolsExt) target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
endif() endif()

View File

@ -37,21 +37,21 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
private: private:
static constexpr std::size_t kUuidLength = static constexpr std::size_t kUuidLength =
#if defined(XGBOOST_USE_HIP) #if defined(XGBOOST_USE_HIP)
sizeof(std::declval<hipDeviceProp>().uuid) / sizeof(uint64_t); sizeof(hipUUID) / sizeof(uint64_t);
#else #elif defined(XGBOOST_USE_CUDA)
sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t); sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
#endif #endif
void GetCudaUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) const { void GetCudaUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) const {
#if defined(XGBOOST_USE_HIP) #if defined(XGBOOST_USE_HIP)
hipDeviceProp prob{}; hipUUID id;
dh::safe_cuda(hipGetDeviceProperties(&prob, device_ordinal_)); hipDeviceGetUuid(&id, device_ordinal_);
#else std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
#elif defined(XGBOOST_USE_CUDA)
cudaDeviceProp prob{}; cudaDeviceProp prob{};
dh::safe_cuda(cudaGetDeviceProperties(&prob, device_ordinal_)); dh::safe_cuda(cudaGetDeviceProperties(&prob, device_ordinal_));
#endif
std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid)); std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
#endif
} }
static std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) { static std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) {

View File

@ -162,6 +162,16 @@ struct BitFieldContainer {
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type; using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
atomicAnd(reinterpret_cast<Type *>(&value), clear_bit); atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
} }
/* compiler hack */
#if defined(__HIP_PLATFORM_AMD__)
void Clear(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = Data()[pos_v.int_pos];
value_type clear_bit = ~(kOne << pos_v.bit_pos);
value &= clear_bit;
}
#endif
#else #else
void Set(index_type pos) noexcept(true) { void Set(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos)); Pos pos_v = Direction::Shift(ToBitPos(pos));

View File

@ -173,7 +173,7 @@ class ColumnMatrix {
this->InitView(); this->InitView();
} }
/** @brief Set the i^th element to be a valid element (instead of missing). */ /** @brief Set the i^th element to be a valid element (instead of missing). */
void SetValid(typename LBitField32::index_type i) { /*missing.Clear(i); */} void SetValid(typename LBitField32::index_type i) {missing.Clear(i);}
/** @brief assign the storage to the view. */ /** @brief assign the storage to the view. */
void InitView() { void InitView() {
missing = LBitField32{Span{storage.data(), storage.size()}}; missing = LBitField32{Span{storage.data(), storage.size()}};

View File

@ -109,7 +109,7 @@ inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int li
if (code == ncclUnhandledCudaError) { if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details. // nccl usually preserves the last error so we can get more details.
auto err = hipPeekAtLastError(); auto err = hipPeekAtLastError();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; ss << " CUDA error: " << thrust::system_error(err, thrust::hip_category()).what() << "\n";
} else if (code == ncclSystemError) { } else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying " ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for RCCL via environment variables listed in its reference: " "the network interface for RCCL via environment variables listed in its reference: "

View File

@ -328,7 +328,7 @@ template <>
struct ToDType<__half> { struct ToDType<__half> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2; static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2;
}; };
#endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__) #endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
template <> template <>
struct ToDType<float> { struct ToDType<float> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4; static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4;
@ -377,10 +377,10 @@ struct ToDType<int64_t> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI8; static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI8;
}; };
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP) #if !defined(XGBOOST_USE_CUDA) && !defined(__HIP_PLATFORM_AMD__)
inline void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); } inline void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); }
inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; } inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP) #endif  // !defined(XGBOOST_USE_CUDA) && !defined(__HIP_PLATFORM_AMD__)
/** /**
* \brief A type erased view over __array_interface__ protocol defined by numpy * \brief A type erased view over __array_interface__ protocol defined by numpy
@ -482,7 +482,7 @@ class ArrayInterface {
type = T::kF2; type = T::kF2;
#else #else
LOG(FATAL) << "Half type is not supported."; LOG(FATAL) << "Half type is not supported.";
#endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__) #endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
} else if (typestr[1] == 'f' && typestr[2] == '4') { } else if (typestr[1] == 'f' && typestr[2] == '4') {
type = T::kF4; type = T::kF4;
} else if (typestr[1] == 'f' && typestr[2] == '8') { } else if (typestr[1] == 'f' && typestr[2] == '8') {
@ -519,7 +519,7 @@ class ArrayInterface {
case T::kF2: { case T::kF2: {
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__) #if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
return func(reinterpret_cast<__half const *>(data)); return func(reinterpret_cast<__half const *>(data));
#endif  // defined(XGBOOST_USE_CUDA) || || defined(__HIP_PLATFORM_AMD__) #endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
} }
case T::kF4: case T::kF4:
return func(reinterpret_cast<float const *>(data)); return func(reinterpret_cast<float const *>(data));
@ -582,7 +582,7 @@ class ArrayInterface {
return static_cast<T>(static_cast<Type>(p_values[offset])); return static_cast<T>(static_cast<Type>(p_values[offset]));
#else #else
return static_cast<T>(p_values[offset]); return static_cast<T>(p_values[offset]);
#endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__) #endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
}); });
} }

View File

@ -1478,11 +1478,11 @@ class LearnerImpl : public LearnerIO {
private: private:
void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info, void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info,
std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) { std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) {
#if defined(XGBOOST_USE_CUDA) #ifndef XGBOOST_USE_HIP
out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength()); out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
collective::ApplyWithLabels(info, out_gpair->Data(), collective::ApplyWithLabels(info, out_gpair->Data(),
[&] { obj_->GetGradient(preds, info, iter, out_gpair); }); [&] { obj_->GetGradient(preds, info, iter, out_gpair); });
#elif defined(XGBOOST_USE_HIP) #else
if (info.IsVerticalFederated()) { if (info.IsVerticalFederated()) {
out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength()); out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
collective::ApplyWithLabels(info, out_gpair->Data(), collective::ApplyWithLabels(info, out_gpair->Data(),

View File

@ -15,6 +15,7 @@
#include "../../../src/collective/communicator-inl.hip.h" #include "../../../src/collective/communicator-inl.hip.h"
#include "../../../src/collective/nccl_device_communicator.hip.h" #include "../../../src/collective/nccl_device_communicator.hip.h"
#endif #endif
#include "../helpers.h"
namespace xgboost { namespace xgboost {
namespace collective { namespace collective {