fix uuid and Clear/SetValid

This commit is contained in:
Hui Liu 2023-10-23 16:32:26 -07:00
parent 55994b1ac7
commit 6ba66463b6
8 changed files with 44 additions and 17 deletions

View File

@ -200,6 +200,18 @@ macro(xgboost_link_nccl target)
endif()
endmacro()
# Link RCCL (ROCm's collective-communication library) into `target`.
# Mirrors the xgboost_link_nccl macro above: the include directory, the
# XGBOOST_USE_RCCL compile definition and the library itself are attached
# with PUBLIC visibility for static builds (so consumers of the static
# archive inherit them) and PRIVATE visibility for shared builds.
macro(xgboost_link_rccl target)
if(BUILD_STATIC_LIB)
# Static archive: usage requirements must propagate to downstream targets.
target_include_directories(${target} PUBLIC ${rccl_INCLUDE_DIR})
target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_RCCL=1)
target_link_libraries(${target} PUBLIC ${rccl_LIBRARY})
else()
# Shared library: RCCL is an implementation detail, keep it private.
target_include_directories(${target} PRIVATE ${rccl_INCLUDE_DIR})
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_RCCL=1)
target_link_libraries(${target} PRIVATE ${rccl_LIBRARY})
endif()
endmacro()
# compile options
macro(xgboost_target_properties target)
set_target_properties(${target} PROPERTIES
@ -302,6 +314,10 @@ macro(xgboost_target_link_libraries target)
xgboost_link_nccl(${target})
endif()
if(USE_RCCL)
xgboost_link_rccl(${target})
endif()
if(USE_NVTX)
target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
endif()

View File

@ -37,21 +37,21 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
private:
static constexpr std::size_t kUuidLength =
#if defined(XGBOOST_USE_HIP)
sizeof(std::declval<hipDeviceProp>().uuid) / sizeof(uint64_t);
#else
sizeof(hipUUID) / sizeof(uint64_t);
#elif defined(XGBOOST_USE_CUDA)
sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
#endif
void GetCudaUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) const {
#if defined(XGBOOST_USE_HIP)
hipDeviceProp prob{};
dh::safe_cuda(hipGetDeviceProperties(&prob, device_ordinal_));
#else
hipUUID id;
hipDeviceGetUuid(&id, device_ordinal_);
std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
#elif defined(XGBOOST_USE_CUDA)
cudaDeviceProp prob{};
dh::safe_cuda(cudaGetDeviceProperties(&prob, device_ordinal_));
#endif
std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
#endif
}
static std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) {

View File

@ -162,6 +162,16 @@ struct BitFieldContainer {
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
}
/* compiler hack */
#if defined(__HIP_PLATFORM_AMD__)
// Clear (set to 0) the bit at `pos` with a plain read-modify-write.
// Non-atomic counterpart used under __HIP_PLATFORM_AMD__ (see guard above).
void Clear(index_type pos) noexcept(true) {
  // Map the logical bit index to its storage word and bit offset, then
  // mask that single bit off in place.
  Pos const where = Direction::Shift(ToBitPos(pos));
  value_type &word = Data()[where.int_pos];
  word &= ~(kOne << where.bit_pos);
}
#endif
#else
void Set(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));

View File

@ -173,7 +173,7 @@ class ColumnMatrix {
this->InitView();
}
/** @brief Set the i^th element to be a valid element (instead of missing). */
void SetValid(typename LBitField32::index_type i) { /*missing.Clear(i); */}
void SetValid(typename LBitField32::index_type i) {missing.Clear(i);}
/** @brief assign the storage to the view. */
void InitView() {
missing = LBitField32{Span{storage.data(), storage.size()}};

View File

@ -109,7 +109,7 @@ inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int li
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = hipPeekAtLastError();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
ss << " CUDA error: " << thrust::system_error(err, thrust::hip_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for RCCL via environment variables listed in its reference: "

View File

@ -328,7 +328,7 @@ template <>
struct ToDType<__half> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2;
};
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA)
template <>
struct ToDType<float> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4;
@ -377,10 +377,10 @@ struct ToDType<int64_t> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI8;
};
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
#if !defined(XGBOOST_USE_CUDA) && !defined(__HIP_PLATFORM_AMD__)
// CPU-only fallbacks, compiled when neither CUDA nor HIP support is built in.
// SyncCudaStream delegates to common::AssertGPUSupport (presumably reports the
// missing-GPU-build error — verify against its definition), and no pointer can
// be device memory in a CPU-only build, so IsCudaPtr is unconditionally false.
inline void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); }
inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
#endif // !defined(XGBOOST_USE_CUDA)
/**
* \brief A type erased view over __array_interface__ protocol defined by numpy
@ -482,7 +482,7 @@ class ArrayInterface {
type = T::kF2;
#else
LOG(FATAL) << "Half type is not supported.";
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA)
} else if (typestr[1] == 'f' && typestr[2] == '4') {
type = T::kF4;
} else if (typestr[1] == 'f' && typestr[2] == '8') {
@ -519,7 +519,7 @@ class ArrayInterface {
case T::kF2: {
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
return func(reinterpret_cast<__half const *>(data));
#endif // defined(XGBOOST_USE_CUDA) || || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA)
}
case T::kF4:
return func(reinterpret_cast<float const *>(data));
@ -582,7 +582,7 @@ class ArrayInterface {
return static_cast<T>(static_cast<Type>(p_values[offset]));
#else
return static_cast<T>(p_values[offset]);
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA)
});
}

View File

@ -1478,11 +1478,11 @@ class LearnerImpl : public LearnerIO {
private:
void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info,
std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) {
#if defined(XGBOOST_USE_CUDA)
#ifndef XGBOOST_USE_HIP
out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
collective::ApplyWithLabels(info, out_gpair->Data(),
[&] { obj_->GetGradient(preds, info, iter, out_gpair); });
#elif defined(XGBOOST_USE_HIP)
#else
if (info.IsVerticalFederated()) {
out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
collective::ApplyWithLabels(info, out_gpair->Data(),

View File

@ -15,6 +15,7 @@
#include "../../../src/collective/communicator-inl.hip.h"
#include "../../../src/collective/nccl_device_communicator.hip.h"
#endif
#include "../helpers.h"
namespace xgboost {
namespace collective {