From 097f11b6e0003f700857bc5f225526694ae79752 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 30 May 2023 20:54:31 +0800
Subject: [PATCH] Support CUDA f16 without transformation. (#9207)

- Support f16 from cupy (see the sketch below).
- Include CUDA header explicitly.
- Cleanup cmake nvtx support.
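
A minimal usage sketch of the path this change enables, assuming a GPU,
cupy, and a CUDA-enabled XGBoost build that includes this patch; the
array shapes, names, and parameters below are illustrative only:

    import cupy as cp
    import xgboost as xgb

    # Random regression data kept in half precision on the GPU.
    X = cp.random.rand(1024, 16).astype(cp.float16)
    y = cp.random.rand(1024).astype(cp.float32)

    # data.py no longer upcasts f16 to f32; the "<f2" typestr from
    # __cuda_array_interface__ is passed through to array_interface.h.
    dtrain = xgb.DMatrix(X, label=y)
    booster = xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=10)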
hasattr(data, "__array__"): data = cupy.array(data, copy=False) - if data.dtype.hasobject or data.dtype in [cupy.float16, cupy.bool_]: + if data.dtype.hasobject or data.dtype in [cupy.bool_]: data = data.astype(cupy.float32, copy=False) return data diff --git a/src/data/array_interface.h b/src/data/array_interface.h index fee22203c..1b18f140a 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -26,6 +26,10 @@ #include "xgboost/logging.h" #include "xgboost/span.h" +#if defined(XGBOOST_USE_CUDA) +#include "cuda_fp16.h" +#endif + namespace xgboost { // Common errors in parsing columnar format. struct ArrayInterfaceErrors { @@ -304,12 +308,12 @@ class ArrayInterfaceHandler { template struct ToDType; // float -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#if defined(XGBOOST_USE_CUDA) template <> struct ToDType<__half> { static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2; }; -#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#endif // defined(XGBOOST_USE_CUDA) template <> struct ToDType { static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4; @@ -459,11 +463,11 @@ class ArrayInterface { CHECK(sizeof(long double) == 16) << error::NoF128(); type = T::kF16; } else if (typestr[1] == 'f' && typestr[2] == '2') { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#if defined(XGBOOST_USE_CUDA) type = T::kF2; #else LOG(FATAL) << "Half type is not supported."; -#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#endif // defined(XGBOOST_USE_CUDA) } else if (typestr[1] == 'f' && typestr[2] == '4') { type = T::kF4; } else if (typestr[1] == 'f' && typestr[2] == '8') { @@ -490,20 +494,17 @@ class ArrayInterface { } } - XGBOOST_DEVICE size_t Shape(size_t i) const { return shape[i]; } - XGBOOST_DEVICE size_t Stride(size_t i) const { return strides[i]; } + [[nodiscard]] XGBOOST_DEVICE std::size_t Shape(size_t i) const { return shape[i]; } + [[nodiscard]] XGBOOST_DEVICE std::size_t Stride(size_t i) const { return strides[i]; } template XGBOOST_HOST_DEV_INLINE decltype(auto) DispatchCall(Fn func) const { using T = ArrayInterfaceHandler::Type; switch (type) { case T::kF2: { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#if defined(XGBOOST_USE_CUDA) return func(reinterpret_cast<__half const *>(data)); -#else - SPAN_CHECK(false); - return func(reinterpret_cast(data)); -#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#endif // defined(XGBOOST_USE_CUDA) } case T::kF4: return func(reinterpret_cast(data)); @@ -540,23 +541,23 @@ class ArrayInterface { return func(reinterpret_cast(data)); } - XGBOOST_DEVICE std::size_t ElementSize() const { + [[nodiscard]] XGBOOST_DEVICE std::size_t ElementSize() const { return this->DispatchCall([](auto *typed_data_ptr) { return sizeof(std::remove_pointer_t); }); } - XGBOOST_DEVICE std::size_t ElementAlignment() const { + [[nodiscard]] XGBOOST_DEVICE std::size_t ElementAlignment() const { return this->DispatchCall([](auto *typed_data_ptr) { return std::alignment_of>::value; }); } template - XGBOOST_DEVICE T operator()(Index &&...index) const { + XGBOOST_HOST_DEV_INLINE T operator()(Index &&...index) const { static_assert(sizeof...(index) <= D, "Invalid index."); return this->DispatchCall([=](auto const *p_values) -> T { std::size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#if defined(XGBOOST_USE_CUDA) // No operator defined for half -> size_t using Type = std::conditional_t< std::is_same<__half, @@ 
@@ -566,7 +567,7 @@ class ArrayInterface {
       return static_cast<T>(static_cast<Type>(p_values[offset]));
 #else
       return static_cast<T>(p_values[offset]);
-#endif
+#endif  // defined(XGBOOST_USE_CUDA)
     });
   }

@@ -603,7 +604,7 @@ void DispatchDType(ArrayInterface const array, std::int32_t device, Fn fn) {
   };
   switch (array.type) {
     case ArrayInterfaceHandler::kF2: {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#if defined(XGBOOST_USE_CUDA)
       dispatch(__half{});
 #endif
       break;