Support CUDA f16 without transformation. (#9207)
- Support f16 from cupy.
- Include CUDA header explicitly.
- Cleanup cmake nvtx support.
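In practice this means a cupy float16 array can be handed to XGBoost as-is, with no `astype(float32)` copy on the Python side. A minimal sketch of the user-facing behavior (a hedged example, assuming a CUDA-enabled build of this branch; the shapes and training parameters are illustrative):

```python
import cupy as cp
import xgboost as xgb

# Half-precision device data is now consumed directly through
# __cuda_array_interface__; previously it was copied to float32 first.
X = cp.random.rand(1024, 16).astype(cp.float16)
y = cp.random.rand(1024).astype(cp.float32)

Xy = xgb.QuantileDMatrix(X, y)
booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10)
```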
commit 097f11b6e0 (parent 6f83d9c69a)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -149,6 +149,8 @@ if (USE_CUDA)
   set(GEN_CODE "")
   format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE)
   add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap)
+
+  find_package(CUDAToolkit REQUIRED)
 endif (USE_CUDA)
 
 if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -124,13 +124,6 @@ function(format_gencode_flags flags out)
   endif (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
 endfunction(format_gencode_flags flags)
 
-macro(enable_nvtx target)
-  find_package(NVTX REQUIRED)
-  target_include_directories(${target} PRIVATE "${NVTX_INCLUDE_DIR}")
-  target_link_libraries(${target} PRIVATE "${NVTX_LIBRARY}")
-  target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
-endmacro()
-
 # Set CUDA related flags to target.  Must be used after code `format_gencode_flags`.
 function(xgboost_set_cuda_flags target)
   target_compile_options(${target} PRIVATE
@@ -162,11 +155,14 @@ function(xgboost_set_cuda_flags target)
   endif (USE_DEVICE_DEBUG)
 
   if (USE_NVTX)
-    enable_nvtx(${target})
+    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
   endif (USE_NVTX)
 
   target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1)
-  target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/gputreeshap)
+  target_include_directories(
+    ${target} PRIVATE
+    ${xgboost_SOURCE_DIR}/gputreeshap
+    ${CUDAToolkit_INCLUDE_DIRS})
 
   if (MSVC)
     target_compile_options(${target} PRIVATE
@@ -289,7 +285,7 @@ macro(xgboost_target_link_libraries target)
   endif (USE_NCCL)
 
   if (USE_NVTX)
-    enable_nvtx(${target})
+    target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
   endif (USE_NVTX)
 
   if (RABIT_BUILD_MPI)
--- a/cmake/modules/FindNVTX.cmake
+++ /dev/null
@@ -1,26 +0,0 @@
-if (NVTX_LIBRARY)
-  unset(NVTX_LIBRARY CACHE)
-endif (NVTX_LIBRARY)
-
-set(NVTX_LIB_NAME nvToolsExt)
-
-
-find_path(NVTX_INCLUDE_DIR
-  NAMES nvToolsExt.h
-  PATHS ${CUDA_HOME}/include ${CUDA_INCLUDE} /usr/local/cuda/include)
-
-
-find_library(NVTX_LIBRARY
-  NAMES nvToolsExt
-  PATHS ${CUDA_HOME}/lib64 /usr/local/cuda/lib64)
-
-message(STATUS "Using nvtx library: ${NVTX_LIBRARY}")
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(NVTX DEFAULT_MSG
-  NVTX_INCLUDE_DIR NVTX_LIBRARY)
-
-mark_as_advanced(
-  NVTX_INCLUDE_DIR
-  NVTX_LIBRARY
-)
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -882,7 +882,7 @@ def _transform_cupy_array(data: DataType) -> CupyT:
 
     if not hasattr(data, "__cuda_array_interface__") and hasattr(data, "__array__"):
         data = cupy.array(data, copy=False)
-    if data.dtype.hasobject or data.dtype in [cupy.float16, cupy.bool_]:
+    if data.dtype.hasobject or data.dtype in [cupy.bool_]:
         data = data.astype(cupy.float32, copy=False)
     return data
 
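For clarity, a standalone restatement of the dtype filter after this change (a sketch, not the module itself): float16 now passes through untouched, while object and bool dtypes are still converted to float32.

```python
import cupy

def _transform_cupy_array_sketch(data: "cupy.ndarray") -> "cupy.ndarray":
    # After this commit only object and bool dtypes force a conversion;
    # cupy.float16 is no longer in the conversion list.
    if data.dtype.hasobject or data.dtype in [cupy.bool_]:
        data = data.astype(cupy.float32, copy=False)
    return data

assert _transform_cupy_array_sketch(cupy.ones(4, dtype=cupy.float16)).dtype == cupy.float16
assert _transform_cupy_array_sketch(cupy.ones(4, dtype=cupy.bool_)).dtype == cupy.float32
```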
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -26,6 +26,10 @@
 #include "xgboost/logging.h"
 #include "xgboost/span.h"
 
+#if defined(XGBOOST_USE_CUDA)
+#include "cuda_fp16.h"
+#endif
+
 namespace xgboost {
 // Common errors in parsing columnar format.
 struct ArrayInterfaceErrors {
@@ -304,12 +308,12 @@ class ArrayInterfaceHandler {
 template <typename T, typename E = void>
 struct ToDType;
 // float
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#if defined(XGBOOST_USE_CUDA)
 template <>
 struct ToDType<__half> {
   static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2;
 };
-#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#endif  // defined(XGBOOST_USE_CUDA)
 template <>
 struct ToDType<float> {
   static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4;
@@ -459,11 +463,11 @@ class ArrayInterface {
       CHECK(sizeof(long double) == 16) << error::NoF128();
       type = T::kF16;
     } else if (typestr[1] == 'f' && typestr[2] == '2') {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#if defined(XGBOOST_USE_CUDA)
       type = T::kF2;
 #else
       LOG(FATAL) << "Half type is not supported.";
-#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#endif  // defined(XGBOOST_USE_CUDA)
     } else if (typestr[1] == 'f' && typestr[2] == '4') {
       type = T::kF4;
     } else if (typestr[1] == 'f' && typestr[2] == '8') {
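The `typestr[1] == 'f' && typestr[2] == '2'` branch above matches the `typestr` field of the CUDA Array Interface. A quick way to inspect the exact string in question (assuming cupy is available):

```python
import cupy as cp

x = cp.zeros((2, 3), dtype=cp.float16)
# "<f2": little-endian IEEE 754 binary16. In CUDA builds the parser
# above now maps this to ArrayInterfaceHandler::kF2 instead of failing.
print(x.__cuda_array_interface__["typestr"])  # "<f2"
```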
@@ -490,20 +494,17 @@ class ArrayInterface {
     }
   }
 
-  XGBOOST_DEVICE size_t Shape(size_t i) const { return shape[i]; }
-  XGBOOST_DEVICE size_t Stride(size_t i) const { return strides[i]; }
+  [[nodiscard]] XGBOOST_DEVICE std::size_t Shape(size_t i) const { return shape[i]; }
+  [[nodiscard]] XGBOOST_DEVICE std::size_t Stride(size_t i) const { return strides[i]; }
 
   template <typename Fn>
   XGBOOST_HOST_DEV_INLINE decltype(auto) DispatchCall(Fn func) const {
     using T = ArrayInterfaceHandler::Type;
     switch (type) {
       case T::kF2: {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#if defined(XGBOOST_USE_CUDA)
         return func(reinterpret_cast<__half const *>(data));
-#else
-        SPAN_CHECK(false);
-        return func(reinterpret_cast<float const *>(data));
-#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#endif  // defined(XGBOOST_USE_CUDA)
       }
       case T::kF4:
         return func(reinterpret_cast<float const *>(data));
@@ -540,23 +541,23 @@ class ArrayInterface {
       return func(reinterpret_cast<uint64_t const *>(data));
   }
 
-  XGBOOST_DEVICE std::size_t ElementSize() const {
+  [[nodiscard]] XGBOOST_DEVICE std::size_t ElementSize() const {
     return this->DispatchCall([](auto *typed_data_ptr) {
       return sizeof(std::remove_pointer_t<decltype(typed_data_ptr)>);
     });
   }
-  XGBOOST_DEVICE std::size_t ElementAlignment() const {
+  [[nodiscard]] XGBOOST_DEVICE std::size_t ElementAlignment() const {
     return this->DispatchCall([](auto *typed_data_ptr) {
       return std::alignment_of<std::remove_pointer_t<decltype(typed_data_ptr)>>::value;
     });
   }
 
   template <typename T = float, typename... Index>
-  XGBOOST_DEVICE T operator()(Index &&...index) const {
+  XGBOOST_HOST_DEV_INLINE T operator()(Index &&...index) const {
     static_assert(sizeof...(index) <= D, "Invalid index.");
     return this->DispatchCall([=](auto const *p_values) -> T {
       std::size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#if defined(XGBOOST_USE_CUDA)
       // No operator defined for half -> size_t
       using Type = std::conditional_t<
           std::is_same<__half,
@@ -566,7 +567,7 @@ class ArrayInterface {
       return static_cast<T>(static_cast<Type>(p_values[offset]));
 #else
       return static_cast<T>(p_values[offset]);
-#endif
+#endif  // defined(XGBOOST_USE_CUDA)
     });
   }
 
@@ -603,7 +604,7 @@ void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
   };
   switch (array.type) {
     case ArrayInterfaceHandler::kF2: {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#if defined(XGBOOST_USE_CUDA)
       dispatch(__half{});
 #endif
       break;
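Taken together with the `DispatchCall` and `DispatchDType` changes, half-precision device arrays should also flow through in-place prediction. A hedged end-to-end check (a sketch under the assumption that the prediction path accepts f16 at this commit; names and parameters are illustrative):

```python
import cupy as cp
import xgboost as xgb

X = cp.random.rand(256, 8).astype(cp.float16)
y = cp.random.rand(256).astype(cp.float32)

booster = xgb.train({"tree_method": "gpu_hist"}, xgb.QuantileDMatrix(X, y))
# inplace_predict reads X through __cuda_array_interface__, so the
# half-precision buffer is consumed without an intermediate f32 copy.
predt = booster.inplace_predict(X)
print(predt.shape)  # (256,)
```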