diff --git a/CMakeLists.txt b/CMakeLists.txt index e93427ed9..bf8f0cf62 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,10 @@ option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binar option(USE_CUDA "Build with GPU acceleration" OFF) option(USE_PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON) option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF) +# This is specifically designed for PyPI binary release and should be disabled for most of the cases. +option(USE_DLOPEN_NCCL "Whether to load nccl dynamically." OFF) option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF) + if(USE_CUDA) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT DEFINED ENV{CUDAARCHS}) set(GPU_COMPUTE_VER "" CACHE STRING @@ -80,6 +83,7 @@ if(USE_CUDA) unset(GPU_COMPUTE_VER CACHE) endif() endif() + # CUDA device LTO was introduced in CMake v3.25 and requires host LTO to also be enabled but can still # be explicitly disabled allowing for LTO on host only, host and device, or neither, but device-only LTO # is not a supproted configuration @@ -115,6 +119,12 @@ endif() if(BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL)) message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.") endif() +if(USE_DLOPEN_NCCL AND (NOT USE_NCCL)) + message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable USE_DLOPEN_NCCL.") +endif() +if(USE_DLOPEN_NCCL AND (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))) + message(SEND_ERROR "`USE_DLOPEN_NCCL` supports only Linux at the moment.") +endif() if(JVM_BINDINGS AND R_LIB) message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.") endif() diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index eafd829fc..9c373bb01 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -171,17 +171,24 @@ function(xgboost_set_cuda_flags target) endif() endfunction() -macro(xgboost_link_nccl target) +function(xgboost_link_nccl target) + set(xgboost_nccl_flags -DXGBOOST_USE_NCCL=1) + if(USE_DLOPEN_NCCL) + list(APPEND xgboost_nccl_flags -DXGBOOST_USE_DLOPEN_NCCL=1) + endif() + if(BUILD_STATIC_LIB) target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR}) - target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_NCCL=1) + target_compile_definitions(${target} PUBLIC ${xgboost_nccl_flags}) target_link_libraries(${target} PUBLIC ${NCCL_LIBRARY}) else() target_include_directories(${target} PRIVATE ${NCCL_INCLUDE_DIR}) - target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NCCL=1) - target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY}) + target_compile_definitions(${target} PRIVATE ${xgboost_nccl_flags}) + if(NOT USE_DLOPEN_NCCL) + target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY}) + endif() endif() -endmacro() +endfunction() # compile options macro(xgboost_target_properties target) diff --git a/cmake/modules/FindNccl.cmake b/cmake/modules/FindNccl.cmake index 02ee731a1..fa3ed0866 100644 --- a/cmake/modules/FindNccl.cmake +++ b/cmake/modules/FindNccl.cmake @@ -54,17 +54,24 @@ find_path(NCCL_INCLUDE_DIR NAMES nccl.h HINTS ${NCCL_ROOT}/include $ENV{NCCL_ROOT}/include) -find_library(NCCL_LIBRARY - NAMES ${NCCL_LIB_NAME} - HINTS ${NCCL_ROOT}/lib $ENV{NCCL_ROOT}/lib/) +if(USE_DLOPEN_NCCL) + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(Nccl DEFAULT_MSG NCCL_INCLUDE_DIR) -message(STATUS "Using nccl library: ${NCCL_LIBRARY}") + mark_as_advanced(NCCL_INCLUDE_DIR) +else() + find_library(NCCL_LIBRARY + NAMES ${NCCL_LIB_NAME} 
+    HINTS ${NCCL_ROOT}/lib $ENV{NCCL_ROOT}/lib/)
-message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(Nccl DEFAULT_MSG
-  NCCL_INCLUDE_DIR NCCL_LIBRARY)
+  message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
-mark_as_advanced(
-  NCCL_INCLUDE_DIR
-  NCCL_LIBRARY
-)
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Nccl DEFAULT_MSG
+    NCCL_INCLUDE_DIR NCCL_LIBRARY)
+
+  mark_as_advanced(
+    NCCL_INCLUDE_DIR
+    NCCL_LIBRARY
+  )
+endif()
diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index 148230fe6..4b145f9a9 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -536,6 +536,37 @@ Troubleshooting
 - MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error
   message that includes `Multiple processes within a communication group ...` upon initialization.

+.. _nccl-load:
+
+- Starting from version 2.1.0, to reduce the size of the binary wheel, the XGBoost package
+  (installed using pip) loads NCCL from the environment instead of bundling it
+  directly. This means that if you encounter an error message like
+  "Failed to load nccl ...", it indicates that NCCL is not installed or properly
+  configured in your environment.
+
+  To resolve this issue, you can install NCCL using pip:
+
+  .. code-block:: sh
+
+    pip install nvidia-nccl-cu12  # (or with any compatible CUDA version)
+
+  The default conda installation of XGBoost should not encounter this error. If you are
+  using a customized XGBoost, please make sure one of the following is true:
+
+  + XGBoost is NOT compiled with the `USE_DLOPEN_NCCL` flag.
+  + The `dmlc_nccl_path` parameter is set to the full NCCL path when initializing the collective.
+
+  Here are some additional tips for troubleshooting NCCL dependency issues:
+
+  + Check the NCCL installation path and verify that it's installed correctly. We try to
+    find NCCL by using ``from nvidia.nccl import lib`` in Python when XGBoost is installed
+    using pip.
+  + Ensure that you have the correct CUDA version installed. NCCL requires a compatible
+    CUDA version to function properly.
+  + If you are not using distributed training with XGBoost and yet see this error, please
+    open an issue on GitHub.
+  + If you continue to encounter NCCL dependency issues, please open an issue on GitHub.
+
 ************
 IPv6 Support
 ************
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index ffa3a6c79..59d4d0881 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -1613,6 +1613,8 @@ XGB_DLL int XGTrackerFree(TrackerHandle handle);
  * - DMLC_TRACKER_PORT: Port number of the tracker.
  * - DMLC_TASK_ID: ID of the current task, can be used to obtain deterministic rank assignment.
  * - DMLC_WORKER_CONNECT_RETRY: Number of retries to connect to the tracker.
+ * - dmlc_nccl_path: The path to the NCCL shared object. Only used if XGBoost is compiled with
+ *   `USE_DLOPEN_NCCL`.
  * Only applicable to the Federated communicator (use upper case for environment variables, use
  * lower case for runtime configuration):
  * - federated_server_address: Address of the federated server.
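The two conditions listed in the troubleshooting entry above can be checked from Python. A minimal sketch, assuming a Rabit tracker whose address and port are placeholders and a placeholder NCCL path::

    import xgboost
    from xgboost import collective

    # `USE_DLOPEN_NCCL` is reported by the build info added in this change.
    binfo = xgboost.build_info()
    print("NCCL is loaded dynamically (dlopen):", binfo.get("USE_DLOPEN_NCCL", False))

    # Placeholder values; point `dmlc_nccl_path` at the libnccl.so.2 you want loaded.
    args = {
        "dmlc_communicator": "rabit",
        "dmlc_tracker_uri": "127.0.0.1",  # placeholder tracker address
        "dmlc_tracker_port": 9091,        # placeholder tracker port
        "dmlc_nccl_path": "/usr/lib/x86_64-linux-gnu/libnccl.so.2",  # placeholder path
    }

    # `CommunicatorContext` forwards these arguments when the collective is initialized:
    # with collective.CommunicatorContext(**args):
    #     ...  # distributed training

Per the `dmlc_nccl_path` documentation above, the parameter is only consulted when XGBoost was compiled with `USE_DLOPEN_NCCL`; otherwise it is ignored.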
diff --git a/include/xgboost/string_view.h b/include/xgboost/string_view.h index ba0d9f368..463558363 100644 --- a/include/xgboost/string_view.h +++ b/include/xgboost/string_view.h @@ -1,23 +1,24 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2023, XGBoost Contributors */ #ifndef XGBOOST_STRING_VIEW_H_ #define XGBOOST_STRING_VIEW_H_ #include // CHECK_LT #include // Span -#include // std::equal,std::min -#include // std::reverse_iterator -#include // std::ostream -#include // std::char_traits,std::string +#include // for equal, min +#include // for size_t +#include // for reverse_iterator +#include // for ostream +#include // for char_traits, string namespace xgboost { struct StringView { private: - using CharT = char; // unsigned char + using CharT = char; using Traits = std::char_traits; CharT const* str_{nullptr}; - size_t size_{0}; + std::size_t size_{0}; public: using value_type = CharT; // NOLINT @@ -28,40 +29,41 @@ struct StringView { public: constexpr StringView() = default; - constexpr StringView(CharT const* str, std::size_t size) : str_{str}, size_{size} {} + constexpr StringView(value_type const* str, std::size_t size) : str_{str}, size_{size} {} StringView(std::string const& str) : str_{str.c_str()}, size_{str.size()} {} // NOLINT - constexpr StringView(CharT const* str) // NOLINT + constexpr StringView(value_type const* str) // NOLINT : str_{str}, size_{str == nullptr ? 0ul : Traits::length(str)} {} - CharT const& operator[](size_t p) const { return str_[p]; } - CharT const& at(size_t p) const { // NOLINT + [[nodiscard]] value_type const& operator[](std::size_t p) const { return str_[p]; } + [[nodiscard]] explicit operator std::string() const { return {this->c_str(), this->size()}; } + [[nodiscard]] value_type const& at(std::size_t p) const { // NOLINT CHECK_LT(p, size_); return str_[p]; } - constexpr std::size_t size() const { return size_; } // NOLINT - constexpr bool empty() const { return size() == 0; } // NOLINT - StringView substr(size_t beg, size_t n) const { // NOLINT + [[nodiscard]] constexpr std::size_t size() const { return size_; } // NOLINT + [[nodiscard]] constexpr bool empty() const { return size() == 0; } // NOLINT + [[nodiscard]] StringView substr(std::size_t beg, std::size_t n) const { // NOLINT CHECK_LE(beg, size_); - size_t len = std::min(n, size_ - beg); + std::size_t len = std::min(n, size_ - beg); return {str_ + beg, len}; } - CharT const* c_str() const { return str_; } // NOLINT + [[nodiscard]] value_type const* c_str() const { return str_; } // NOLINT - constexpr CharT const* cbegin() const { return str_; } // NOLINT - constexpr CharT const* cend() const { return str_ + size(); } // NOLINT - constexpr CharT const* begin() const { return str_; } // NOLINT - constexpr CharT const* end() const { return str_ + size(); } // NOLINT + [[nodiscard]] constexpr const_iterator cbegin() const { return str_; } // NOLINT + [[nodiscard]] constexpr const_iterator cend() const { return str_ + size(); } // NOLINT + [[nodiscard]] constexpr iterator begin() const { return str_; } // NOLINT + [[nodiscard]] constexpr iterator end() const { return str_ + size(); } // NOLINT - const_reverse_iterator rbegin() const noexcept { // NOLINT + [[nodiscard]] const_reverse_iterator rbegin() const noexcept { // NOLINT return const_reverse_iterator(this->end()); } - const_reverse_iterator crbegin() const noexcept { // NOLINT + [[nodiscard]] const_reverse_iterator crbegin() const noexcept { // NOLINT return const_reverse_iterator(this->end()); } - 
const_reverse_iterator rend() const noexcept { // NOLINT + [[nodiscard]] const_reverse_iterator rend() const noexcept { // NOLINT return const_reverse_iterator(this->begin()); } - const_reverse_iterator crend() const noexcept { // NOLINT + [[nodiscard]] const_reverse_iterator crend() const noexcept { // NOLINT return const_reverse_iterator(this->begin()); } }; diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py index 18908fc1c..3692cb13c 100755 --- a/jvm-packages/create_jni.py +++ b/jvm-packages/create_jni.py @@ -103,6 +103,7 @@ if __name__ == "__main__": if cli_args.use_cuda == 'ON': CONFIG['USE_CUDA'] = 'ON' CONFIG['USE_NCCL'] = 'ON' + CONFIG["USE_DLOPEN_NCCL"] = "OFF" args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()] diff --git a/plugin/federated/federated_comm.cuh b/plugin/federated/federated_comm.cuh index df9127644..58c52f67e 100644 --- a/plugin/federated/federated_comm.cuh +++ b/plugin/federated/federated_comm.cuh @@ -5,9 +5,11 @@ #include // for shared_ptr +#include "../../src/collective/coll.h" // for Coll #include "../../src/common/device_helpers.cuh" // for CUDAStreamView #include "federated_comm.h" // for FederatedComm #include "xgboost/context.h" // for Context +#include "xgboost/logging.h" namespace xgboost::collective { class CUDAFederatedComm : public FederatedComm { @@ -16,5 +18,9 @@ class CUDAFederatedComm : public FederatedComm { public: explicit CUDAFederatedComm(Context const* ctx, std::shared_ptr impl); [[nodiscard]] auto Stream() const { return stream_; } + Comm* MakeCUDAVar(Context const*, std::shared_ptr) const override { + LOG(FATAL) << "[Internal Error]: Invalid request for CUDA variant."; + return nullptr; + } }; } // namespace xgboost::collective diff --git a/plugin/federated/federated_comm.h b/plugin/federated/federated_comm.h index a24798626..750d94abd 100644 --- a/plugin/federated/federated_comm.h +++ b/plugin/federated/federated_comm.h @@ -10,12 +10,12 @@ #include // for unique_ptr #include // for string -#include "../../src/collective/comm.h" // for Comm +#include "../../src/collective/comm.h" // for HostComm #include "../../src/common/json_utils.h" // for OptionalArg #include "xgboost/json.h" namespace xgboost::collective { -class FederatedComm : public Comm { +class FederatedComm : public HostComm { std::shared_ptr stub_; void Init(std::string const& host, std::int32_t port, std::int32_t world, std::int32_t rank, @@ -64,6 +64,6 @@ class FederatedComm : public Comm { [[nodiscard]] bool IsFederated() const override { return true; } [[nodiscard]] federated::Federated::Stub* Handle() const { return stub_.get(); } - Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const override; + [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const override; }; } // namespace xgboost::collective diff --git a/python-package/packager/build_config.py b/python-package/packager/build_config.py index 26392a897..d3733d628 100644 --- a/python-package/packager/build_config.py +++ b/python-package/packager/build_config.py @@ -15,6 +15,8 @@ class BuildConfiguration: # pylint: disable=R0902 use_cuda: bool = False # Whether to enable NCCL use_nccl: bool = False + # Whether to load nccl dynamically + use_dlopen_nccl: bool = False # Whether to enable HDFS use_hdfs: bool = False # Whether to enable Azure Storage diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 199e0f06c..3bd642cc7 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -29,7 +29,8 @@ 
classifiers = [ ] dependencies = [ "numpy", - "scipy" + "scipy", + "nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'" ] [project.urls] diff --git a/python-package/xgboost/collective.py b/python-package/xgboost/collective.py index 4c67ccbfc..4eb5ea2ab 100644 --- a/python-package/xgboost/collective.py +++ b/python-package/xgboost/collective.py @@ -2,14 +2,15 @@ import ctypes import json import logging +import os import pickle from enum import IntEnum, unique -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import numpy as np from ._typing import _T -from .core import _LIB, _check_call, c_str, from_pystr_to_cstr, py_str +from .core import _LIB, _check_call, build_info, c_str, from_pystr_to_cstr, py_str LOGGER = logging.getLogger("[xgboost.collective]") @@ -250,6 +251,31 @@ class CommunicatorContext: def __init__(self, **args: Any) -> None: self.args = args + key = "dmlc_nccl_path" + if args.get(key, None) is not None: + return + + binfo = build_info() + if not binfo["USE_DLOPEN_NCCL"]: + return + + try: + # PyPI package of NCCL. + from nvidia.nccl import lib + + # There are two versions of nvidia-nccl, one is from PyPI, another one from + # nvidia-pyindex. We support only the first one as the second one is too old + # (2.9.8 as of writing). + if lib.__file__ is not None: + dirname: Optional[str] = os.path.dirname(lib.__file__) + else: + dirname = None + + if dirname: + path = os.path.join(dirname, "libnccl.so.2") + self.args[key] = path + except ImportError: + pass def __enter__(self) -> Dict[str, Any]: init(**self.args) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 648851b31..bfc94aa04 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -184,6 +184,13 @@ def _py_version() -> str: return f.read().strip() +def _register_log_callback(lib: ctypes.CDLL) -> None: + lib.XGBGetLastError.restype = ctypes.c_char_p + lib.callback = _get_log_callback_func() # type: ignore + if lib.XGBRegisterLogCallback(lib.callback) != 0: + raise XGBoostError(lib.XGBGetLastError()) + + def _load_lib() -> ctypes.CDLL: """Load xgboost Library.""" lib_paths = find_lib_path() @@ -228,10 +235,7 @@ Likely causes: Error message(s): {os_error_list} """ ) - lib.XGBGetLastError.restype = ctypes.c_char_p - lib.callback = _get_log_callback_func() # type: ignore - if lib.XGBRegisterLogCallback(lib.callback) != 0: - raise XGBoostError(lib.XGBGetLastError()) + _register_log_callback(lib) def parse(ver: str) -> Tuple[int, int, int]: """Avoid dependency on packaging (PEP 440).""" diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 8975bfb2e..22f03640e 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -7,8 +7,6 @@ #include // for strtoimax #include // for nan #include // for strcmp -#include // for operator<<, basic_ostream, ios, stringstream -#include // for less #include // for numeric_limits #include // for operator!=, _Rb_tree_const_iterator, _Rb_tre... #include // for shared_ptr, allocator, __shared_ptr_access @@ -22,7 +20,6 @@ #include "../common/charconv.h" // for from_chars, to_chars, NumericLimits, from_ch... #include "../common/hist_util.h" // for HistogramCuts #include "../common/io.h" // for FileExtension, LoadSequentialFile, MemoryBuf... -#include "../common/linalg_op.h" // for ElementWiseTransformHost #include "../common/threading_utils.h" // for OmpGetNumThreads, ParallelFor #include "../data/adapter.h" // for ArrayAdapter, DenseAdapter, RecordBatchesIte... 
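When no explicit path is given, the lookup added to ``CommunicatorContext`` above resolves ``libnccl.so.2`` from the ``nvidia-nccl-cu12`` wheel. A standalone sketch of the same lookup, with a ``ctypes`` load check that mirrors what the dlopen-based stub (introduced later in this patch) does at start-up; the version decoding follows NCCL's ``major * 10000 + minor * 100 + patch`` scheme::

    import ctypes
    import os
    from typing import Optional


    def find_pip_nccl() -> Optional[str]:
        """Resolve libnccl.so.2 from the nvidia-nccl-cu12 wheel, if installed."""
        try:
            # PyPI package; the older nvidia-pyindex variant is not supported.
            from nvidia.nccl import lib
        except ImportError:
            return None
        if lib.__file__ is None:
            return None
        return os.path.join(os.path.dirname(lib.__file__), "libnccl.so.2")


    path = find_pip_nccl()
    if path is None:
        print("nvidia-nccl-cu12 is not installed; pass dmlc_nccl_path explicitly.")
    else:
        nccl = ctypes.CDLL(path)  # same mechanism as dlopen() in the C++ stub
        version = ctypes.c_int(0)
        if nccl.ncclGetVersion(ctypes.byref(version)) == 0:  # 0 == ncclSuccess
            v = version.value
            print(f"Loaded NCCL {v // 10000}.{v // 100 % 100}.{v % 100} from {path}")

``find_pip_nccl`` is a hypothetical helper used only for illustration; XGBoost performs this lookup internally when the wheel is built with `USE_DLOPEN_NCCL`.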
#include "../data/ellpack_page.h" // for EllpackPage @@ -35,14 +32,12 @@ #include "dmlc/parameter.h" // for FieldAccessEntry, FieldEntry, ParamManager #include "dmlc/thread_local.h" // for ThreadLocalStore #include "rabit/c_api.h" // for RabitLinkTag -#include "rabit/rabit.h" // for CheckPoint, LoadCheckPoint #include "xgboost/base.h" // for bst_ulong, bst_float, GradientPair, bst_feat... #include "xgboost/context.h" // for Context #include "xgboost/data.h" // for DMatrix, MetaInfo, DataType, ExtSparsePage #include "xgboost/feature_map.h" // for FeatureMap #include "xgboost/global_config.h" // for GlobalConfiguration, GlobalConfigThreadLocal... #include "xgboost/host_device_vector.h" // for HostDeviceVector -#include "xgboost/intrusive_ptr.h" // for xgboost #include "xgboost/json.h" // for Json, get, Integer, IsA, Boolean, String #include "xgboost/learner.h" // for Learner, PredictionType #include "xgboost/logging.h" // for LOG_FATAL, LogMessageFatal, CHECK, LogCheck_EQ @@ -79,6 +74,7 @@ void XGBBuildInfoDevice(Json *p_info) { info["USE_CUDA"] = Boolean{false}; info["USE_NCCL"] = Boolean{false}; info["USE_RMM"] = Boolean{false}; + info["USE_DLOPEN_NCCL"] = Boolean{false}; } } // namespace xgboost #endif diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index 84a371558..4ace8b7cc 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -33,8 +33,16 @@ void XGBBuildInfoDevice(Json *p_info) { info["USE_NCCL"] = Boolean{true}; v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}}; info["NCCL_VERSION"] = v; + +#if defined(XGBOOST_USE_DLOPEN_NCCL) + info["USE_DLOPEN_NCCL"] = Boolean{true}; +#else + info["USE_DLOPEN_NCCL"] = Boolean{false}; +#endif // defined(XGBOOST_USE_DLOPEN_NCCL) + #else info["USE_NCCL"] = Boolean{false}; + info["USE_DLOPEN_NCCL"] = Boolean{false}; #endif #if defined(XGBOOST_USE_RMM) diff --git a/src/collective/coll.cu b/src/collective/coll.cu index bac9fb094..60072b6a5 100644 --- a/src/collective/coll.cu +++ b/src/collective/coll.cu @@ -19,25 +19,6 @@ Coll* Coll::MakeCUDAVar() { return new NCCLColl{}; } NCCLColl::~NCCLColl() = default; namespace { -Result GetNCCLResult(ncclResult_t code) { - if (code == ncclSuccess) { - return Success(); - } - - std::stringstream ss; - ss << "NCCL failure: " << ncclGetErrorString(code) << "."; - if (code == ncclUnhandledCudaError) { - // nccl usually preserves the last error so we can get more details. - auto err = cudaPeekAtLastError(); - ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; - } else if (code == ncclSystemError) { - ss << " This might be caused by a network configuration issue. Please consider specifying " - "the network interface for NCCL via environment variables listed in its reference: " - "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n"; - } - return Fail(ss.str()); -} - auto GetNCCLType(ArrayInterfaceHandler::Type type) { auto fatal = [] { LOG(FATAL) << "Invalid type for NCCL operation."; @@ -94,11 +75,12 @@ void RunBitwiseAllreduce(dh::CUDAStreamView stream, common::Span ou common::Span data, Op op) { dh::device_vector buffer(data.size() * pcomm->World()); auto* device_buffer = buffer.data().get(); + auto stub = pcomm->Stub(); // First gather data from all the workers. 
CHECK(handle); - auto rc = GetNCCLResult( - ncclAllGather(data.data(), device_buffer, data.size(), ncclInt8, handle, pcomm->Stream())); + auto rc = GetNCCLResult(stub, stub->Allgather(data.data(), device_buffer, data.size(), ncclInt8, + handle, pcomm->Stream())); if (!rc.OK()) { return rc; } @@ -149,6 +131,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { } auto nccl = dynamic_cast(&comm); CHECK(nccl); + auto stub = nccl->Stub(); + return Success() << [&] { if (IsBitwiseOp(op)) { return BitwiseAllReduce(nccl, nccl->Handle(), data, op); @@ -156,9 +140,9 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { return DispatchDType(type, [=](auto t) { using T = decltype(t); auto rdata = common::RestoreType(data); - auto rc = ncclAllReduce(data.data(), data.data(), rdata.size(), GetNCCLType(type), - GetNCCLRedOp(op), nccl->Handle(), nccl->Stream()); - return GetNCCLResult(rc); + auto rc = stub->Allreduce(data.data(), data.data(), rdata.size(), GetNCCLType(type), + GetNCCLRedOp(op), nccl->Handle(), nccl->Stream()); + return GetNCCLResult(stub, rc); }); } } << [&] { return nccl->Block(); }; @@ -171,9 +155,11 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { } auto nccl = dynamic_cast(&comm); CHECK(nccl); + auto stub = nccl->Stub(); + return Success() << [&] { - return GetNCCLResult(ncclBroadcast(data.data(), data.data(), data.size_bytes(), ncclInt8, root, - nccl->Handle(), nccl->Stream())); + return GetNCCLResult(stub, stub->Broadcast(data.data(), data.data(), data.size_bytes(), + ncclInt8, root, nccl->Handle(), nccl->Stream())); } << [&] { return nccl->Block(); }; } @@ -184,10 +170,12 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { } auto nccl = dynamic_cast(&comm); CHECK(nccl); + auto stub = nccl->Stub(); + auto send = data.subspan(comm.Rank() * size, size); return Success() << [&] { - return GetNCCLResult( - ncclAllGather(send.data(), data.data(), size, ncclInt8, nccl->Handle(), nccl->Stream())); + return GetNCCLResult(stub, stub->Allgather(send.data(), data.data(), size, ncclInt8, + nccl->Handle(), nccl->Stream())); } << [&] { return nccl->Block(); }; } @@ -199,19 +187,20 @@ namespace cuda_impl { */ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span data, common::Span sizes, common::Span recv) { - return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] { + auto stub = comm->Stub(); + return Success() << [&stub] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] { std::size_t offset = 0; for (std::int32_t r = 0; r < comm->World(); ++r) { auto as_bytes = sizes[r]; - auto rc = ncclBroadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes, - ncclInt8, r, comm->Handle(), dh::DefaultStream()); + auto rc = stub->Broadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes, + ncclInt8, r, comm->Handle(), dh::DefaultStream()); if (rc != ncclSuccess) { - return GetNCCLResult(rc); + return GetNCCLResult(stub, rc); } offset += as_bytes; } return Success(); - } << [] { return GetNCCLResult(ncclGroupEnd()); }; + } << [&] { return GetNCCLResult(stub, stub->GroupEnd()); }; } } // namespace cuda_impl @@ -224,10 +213,11 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span if (!comm.IsDistributed()) { return Success(); } + auto stub = nccl->Stub(); switch (algo) { case AllgatherVAlgo::kRing: { - return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] { + return Success() << [&] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] { // get worker offset detail::AllgatherVOffset(sizes, recv_segments); // copy data @@ -237,8 +227,8 @@ Result 
BroadcastAllgatherV(NCCLComm const* comm, common::Span cudaMemcpyDeviceToDevice, nccl->Stream())); } return detail::RingAllgatherV(comm, sizes, recv_segments, recv); - } << [] { - return GetNCCLResult(ncclGroupEnd()); + } << [&] { + return GetNCCLResult(stub, stub->GroupEnd()); } << [&] { return nccl->Block(); }; } case AllgatherVAlgo::kBcast: { diff --git a/src/collective/coll.cuh b/src/collective/coll.cuh index 87fb46711..6ededd101 100644 --- a/src/collective/coll.cuh +++ b/src/collective/coll.cuh @@ -8,7 +8,8 @@ #include "../data/array_interface.h" // for ArrayInterfaceHandler #include "coll.h" // for Coll #include "comm.h" // for Comm -#include "xgboost/span.h" // for Span +#include "nccl_stub.h" +#include "xgboost/span.h" // for Span namespace xgboost::collective { class NCCLColl : public Coll { diff --git a/src/collective/comm.cc b/src/collective/comm.cc index 9da9083f8..783278b65 100644 --- a/src/collective/comm.cc +++ b/src/collective/comm.cc @@ -7,15 +7,12 @@ #include // for seconds #include // for exit #include // for shared_ptr -#include // for unique_lock #include // for string #include // for move, forward #include "../common/common.h" // for AssertGPUSupport -#include "../common/json_utils.h" // for OptionalArg #include "allgather.h" // for RingAllgather #include "protocol.h" // for kMagic -#include "tracker.h" // for GetHostAddress #include "xgboost/base.h" // for XGBOOST_STRICT_R_MODE #include "xgboost/collective/socket.h" // for TCPSocket #include "xgboost/json.h" // for Json, Object @@ -62,14 +59,6 @@ Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, st this->Rank(), this->World()); } -#if !defined(XGBOOST_USE_NCCL) -Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr) const { - common::AssertGPUSupport(); - common::AssertNCCLSupport(); - return nullptr; -} -#endif // !defined(XGBOOST_USE_NCCL) - [[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport, proto::PeerInfo ninfo, std::chrono::seconds timeout, std::int32_t retry, @@ -194,12 +183,21 @@ Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr) const { } RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, - std::int32_t retry, std::string task_id) - : Comm{std::move(host), port, timeout, retry, std::move(task_id)} { + std::int32_t retry, std::string task_id, StringView nccl_path) + : HostComm{std::move(host), port, timeout, retry, std::move(task_id)}, + nccl_path_{std::move(nccl_path)} { auto rc = this->Bootstrap(timeout_, retry_, task_id_); CHECK(rc.OK()) << rc.Report(); } +#if !defined(XGBOOST_USE_NCCL) +Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr) const { + common::AssertGPUSupport(); + common::AssertNCCLSupport(); + return nullptr; +} +#endif // !defined(XGBOOST_USE_NCCL) + [[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry, std::string task_id) { TCPSocket tracker; diff --git a/src/collective/comm.cu b/src/collective/comm.cu index 09edc522d..cc67def0a 100644 --- a/src/collective/comm.cu +++ b/src/collective/comm.cu @@ -13,19 +13,21 @@ #include "../common/cuda_context.cuh" // for CUDAContext #include "../common/device_helpers.cuh" // for DefaultStream #include "../common/type.h" // for EraseType -#include "broadcast.h" // for Broadcast #include "comm.cuh" // for NCCLComm #include "comm.h" // for Comm +#include "nccl_stub.h" // for NcclStub #include "xgboost/collective/result.h" // for Result #include "xgboost/span.h" // for Span namespace 
xgboost::collective { namespace { -Result GetUniqueId(Comm const& comm, std::shared_ptr coll, ncclUniqueId* pid) { +Result GetUniqueId(Comm const& comm, std::shared_ptr stub, std::shared_ptr coll, + ncclUniqueId* pid) { static const int kRootRank = 0; ncclUniqueId id; if (comm.Rank() == kRootRank) { - dh::safe_nccl(ncclGetUniqueId(&id)); + auto rc = GetNCCLResult(stub, stub->GetUniqueId(&id)); + CHECK(rc.OK()) << rc.Report(); } auto rc = coll->Broadcast( comm, common::Span{reinterpret_cast(&id), sizeof(ncclUniqueId)}, kRootRank); @@ -54,11 +56,12 @@ static std::string PrintUUID(xgboost::common::Span c } } // namespace -Comm* Comm::MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const { - return new NCCLComm{ctx, *this, pimpl}; +Comm* RabitComm::MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const { + return new NCCLComm{ctx, *this, pimpl, StringView{this->nccl_path_}}; } -NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr pimpl) +NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr pimpl, + StringView nccl_path) : Comm{root.TrackerInfo().host, root.TrackerInfo().port, root.Timeout(), root.Retry(), root.TaskID()}, stream_{ctx->CUDACtx()->Stream()} { @@ -70,6 +73,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr p } dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); + stub_ = std::make_shared(nccl_path); std::vector uuids(root.World() * kUuidLength, 0); auto s_uuid = xgboost::common::Span{uuids.data(), uuids.size()}; @@ -95,19 +99,24 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr p << "Multiple processes within communication group running on same CUDA " << "device is not supported. " << PrintUUID(s_this_uuid) << "\n"; - rc = GetUniqueId(root, pimpl, &nccl_unique_id_); + rc = std::move(rc) << [&] { + return GetUniqueId(root, this->stub_, pimpl, &nccl_unique_id_); + } << [&] { + return GetNCCLResult(this->stub_, this->stub_->CommInitRank(&nccl_comm_, root.World(), + nccl_unique_id_, root.Rank())); + }; CHECK(rc.OK()) << rc.Report(); - dh::safe_nccl(ncclCommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank())); for (std::int32_t r = 0; r < root.World(); ++r) { this->channels_.emplace_back( - std::make_shared(root, r, nccl_comm_, dh::DefaultStream())); + std::make_shared(root, r, nccl_comm_, stub_, dh::DefaultStream())); } } NCCLComm::~NCCLComm() { if (nccl_comm_) { - dh::safe_nccl(ncclCommDestroy(nccl_comm_)); + auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_)); + CHECK(rc.OK()) << rc.Report(); } } } // namespace xgboost::collective diff --git a/src/collective/comm.cuh b/src/collective/comm.cuh index ea15c50f3..ef537b5a9 100644 --- a/src/collective/comm.cuh +++ b/src/collective/comm.cuh @@ -6,9 +6,13 @@ #ifdef XGBOOST_USE_NCCL #include "nccl.h" #endif // XGBOOST_USE_NCCL + +#include // for move + #include "../common/device_helpers.cuh" #include "coll.h" #include "comm.h" +#include "nccl_stub.h" // for NcclStub #include "xgboost/context.h" namespace xgboost::collective { @@ -21,15 +25,20 @@ inline Result GetCUDAResult(cudaError rc) { return Fail(msg); } +#if defined(XGBOOST_USE_NCCL) class NCCLComm : public Comm { ncclComm_t nccl_comm_{nullptr}; + std::shared_ptr stub_; ncclUniqueId nccl_unique_id_{}; dh::CUDAStreamView stream_; + std::string nccl_path_; public: [[nodiscard]] ncclComm_t Handle() const { return nccl_comm_; } + auto Stub() const { return stub_; } - explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr pimpl); + explicit 
NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr pimpl, + StringView nccl_path); [[nodiscard]] Result LogTracker(std::string) const override { LOG(FATAL) << "Device comm is used for logging."; return Fail("Undefined."); @@ -43,25 +52,53 @@ class NCCLComm : public Comm { } }; +inline Result GetNCCLResult(std::shared_ptr stub, ncclResult_t code) { + if (code == ncclSuccess) { + return Success(); + } + + std::stringstream ss; + ss << "NCCL failure: " << stub->GetErrorString(code) << "."; + if (code == ncclUnhandledCudaError) { + // nccl usually preserves the last error so we can get more details. + auto err = cudaPeekAtLastError(); + ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; + } else if (code == ncclSystemError) { + ss << " This might be caused by a network configuration issue. Please consider specifying " + "the network interface for NCCL via environment variables listed in its reference: " + "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n"; + } + return Fail(ss.str()); +} + class NCCLChannel : public Channel { std::int32_t rank_{-1}; ncclComm_t nccl_comm_{}; + std::shared_ptr stub_; dh::CUDAStreamView stream_; public: explicit NCCLChannel(Comm const& comm, std::int32_t rank, ncclComm_t nccl_comm, - dh::CUDAStreamView stream) - : rank_{rank}, nccl_comm_{nccl_comm}, Channel{comm, nullptr}, stream_{stream} {} + std::shared_ptr stub, dh::CUDAStreamView stream) + : rank_{rank}, + nccl_comm_{nccl_comm}, + stub_{std::move(stub)}, + Channel{comm, nullptr}, + stream_{stream} {} void SendAll(std::int8_t const* ptr, std::size_t n) override { - dh::safe_nccl(ncclSend(ptr, n, ncclInt8, rank_, nccl_comm_, stream_)); + auto rc = GetNCCLResult(stub_, stub_->Send(ptr, n, ncclInt8, rank_, nccl_comm_, stream_)); + CHECK(rc.OK()) << rc.Report(); } void RecvAll(std::int8_t* ptr, std::size_t n) override { - dh::safe_nccl(ncclRecv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_)); + auto rc = GetNCCLResult(stub_, stub_->Recv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_)); + CHECK(rc.OK()) << rc.Report(); } [[nodiscard]] Result Block() override { auto rc = stream_.Sync(false); return GetCUDAResult(rc); } }; + +#endif // defined(XGBOOST_USE_NCCL) } // namespace xgboost::collective diff --git a/src/collective/comm.h b/src/collective/comm.h index 76ab479d7..b2f519e3d 100644 --- a/src/collective/comm.h +++ b/src/collective/comm.h @@ -34,6 +34,8 @@ inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) { return nrank; } +inline StringView DefaultNcclName() { return "libnccl.so.2"; } + class Channel; class Coll; @@ -86,11 +88,21 @@ class Comm : public std::enable_shared_from_this { [[nodiscard]] virtual Result LogTracker(std::string msg) const = 0; [[nodiscard]] virtual Result SignalError(Result const&) { return Success(); } - - virtual Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const; }; -class RabitComm : public Comm { +/** + * @brief Base class for CPU-based communicator. + */ +class HostComm : public Comm { + public: + using Comm::Comm; + [[nodiscard]] virtual Comm* MakeCUDAVar(Context const* ctx, + std::shared_ptr pimpl) const = 0; +}; + +class RabitComm : public HostComm { + std::string nccl_path_ = std::string{DefaultNcclName()}; + [[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry, std::string task_id); [[nodiscard]] Result Shutdown(); @@ -100,13 +112,15 @@ class RabitComm : public Comm { RabitComm() = default; // ctor for testing where environment is known. 
RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, - std::int32_t retry, std::string task_id); + std::int32_t retry, std::string task_id, StringView nccl_path); ~RabitComm() noexcept(false) override; [[nodiscard]] bool IsFederated() const override { return false; } [[nodiscard]] Result LogTracker(std::string msg) const override; [[nodiscard]] Result SignalError(Result const&) override; + + [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const override; }; /** diff --git a/src/collective/comm_group.cc b/src/collective/comm_group.cc index 3d2e24492..f7bbba754 100644 --- a/src/collective/comm_group.cc +++ b/src/collective/comm_group.cc @@ -37,7 +37,7 @@ namespace xgboost::collective { [[nodiscard]] Comm const& CommGroup::Ctx(Context const* ctx, DeviceOrd device) const { if (device.IsCUDA()) { CHECK(ctx->IsCUDA()); - if (!gpu_comm_) { + if (!gpu_comm_ || gpu_comm_->World() != comm_->World()) { gpu_comm_.reset(comm_->MakeCUDAVar(ctx, backend_)); } return *gpu_comm_; @@ -55,7 +55,6 @@ CommGroup::CommGroup() } std::string type = OptionalArg(config, "dmlc_communicator", std::string{"rabit"}); - std::vector keys; // Try both lower and upper case for compatibility auto get_param = [&](std::string name, auto dft, auto t) { std::string upper; @@ -63,8 +62,6 @@ CommGroup::CommGroup() [](char c) { return std::toupper(c); }); std::transform(name.cbegin(), name.cend(), name.begin(), [](char c) { return std::tolower(c); }); - keys.push_back(upper); - keys.push_back(name); auto const& obj = get(config); auto it = obj.find(upper); @@ -75,19 +72,19 @@ CommGroup::CommGroup() } }; // Common args - auto retry = - OptionalArg(config, "dmlc_retry", static_cast(DefaultRetry())); - auto timeout = OptionalArg(config, "dmlc_timeout_sec", - static_cast(DefaultTimeoutSec())); + auto retry = get_param("dmlc_retry", static_cast(DefaultRetry()), Integer{}); + auto timeout = + get_param("dmlc_timeout_sec", static_cast(DefaultTimeoutSec()), Integer{}); auto task_id = get_param("dmlc_task_id", std::string{}, String{}); if (type == "rabit") { auto host = get_param("dmlc_tracker_uri", std::string{}, String{}); auto port = get_param("dmlc_tracker_port", static_cast(0), Integer{}); + auto nccl = get_param("dmlc_nccl_path", std::string{DefaultNcclName()}, String{}); auto ptr = new CommGroup{std::shared_ptr{new RabitComm{ // NOLINT host, static_cast(port), std::chrono::seconds{timeout}, - static_cast(retry), task_id}}, + static_cast(retry), task_id, nccl}}, std::shared_ptr(new Coll{})}; // NOLINT return ptr; } else if (type == "federated") { diff --git a/src/collective/comm_group.h b/src/collective/comm_group.h index 62f3e565f..2f6f91d73 100644 --- a/src/collective/comm_group.h +++ b/src/collective/comm_group.h @@ -17,14 +17,16 @@ namespace xgboost::collective { * collective implementations. 
*/ class CommGroup { - std::shared_ptr comm_; + std::shared_ptr comm_; mutable std::shared_ptr gpu_comm_; std::shared_ptr backend_; mutable std::shared_ptr gpu_coll_; // lazy initialization CommGroup(std::shared_ptr comm, std::shared_ptr coll) - : comm_{std::move(comm)}, backend_{std::move(coll)} {} + : comm_{std::dynamic_pointer_cast(comm)}, backend_{std::move(coll)} { + CHECK(comm_); + } public: CommGroup(); diff --git a/src/collective/communicator.cc b/src/collective/communicator.cc index 6ac9ff58e..7fabe50b4 100644 --- a/src/collective/communicator.cc +++ b/src/collective/communicator.cc @@ -3,6 +3,7 @@ */ #include "communicator.h" +#include "comm.h" #include "in_memory_communicator.h" #include "noop_communicator.h" #include "rabit_communicator.h" @@ -14,8 +15,12 @@ namespace xgboost::collective { thread_local std::unique_ptr Communicator::communicator_{new NoOpCommunicator()}; thread_local CommunicatorType Communicator::type_{}; +thread_local std::string Communicator::nccl_path_{}; void Communicator::Init(Json const& config) { + auto nccl = OptionalArg(config, "dmlc_nccl_path", std::string{DefaultNcclName()}); + nccl_path_ = nccl; + auto type = GetTypeFromEnv(); auto const arg = GetTypeFromConfig(config); if (arg != CommunicatorType::kUnknown) { diff --git a/src/collective/communicator.cu b/src/collective/communicator.cu index a80eab6d5..a7552d356 100644 --- a/src/collective/communicator.cu +++ b/src/collective/communicator.cu @@ -31,17 +31,17 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) { #ifdef XGBOOST_USE_NCCL switch (type_) { case CommunicatorType::kRabit: - device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false)); + device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_)); break; case CommunicatorType::kFederated: case CommunicatorType::kInMemory: device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal)); break; case CommunicatorType::kInMemoryNccl: - device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true)); + device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true, nccl_path_)); break; default: - device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false)); + device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_)); } #else device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal)); diff --git a/src/collective/communicator.h b/src/collective/communicator.h index feb446355..b6910b80f 100644 --- a/src/collective/communicator.h +++ b/src/collective/communicator.h @@ -234,6 +234,7 @@ class Communicator { static thread_local std::unique_ptr communicator_; static thread_local CommunicatorType type_; + static thread_local std::string nccl_path_; #if defined(XGBOOST_USE_CUDA) static thread_local std::unique_ptr device_communicator_; #endif diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu index 3d4905cb1..25b198bde 100644 --- a/src/collective/nccl_device_communicator.cu +++ b/src/collective/nccl_device_communicator.cu @@ -2,12 +2,14 @@ * Copyright 2023 XGBoost contributors */ #if defined(XGBOOST_USE_NCCL) +#include "comm.cuh" #include "nccl_device_communicator.cuh" namespace xgboost { namespace collective { -NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync) +NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync, + StringView nccl_path) : device_ordinal_{device_ordinal}, 
needs_sync_{needs_sync}, world_size_{GetWorldSize()}, @@ -18,6 +20,7 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy if (world_size_ == 1) { return; } + stub_ = std::make_shared(std::move(nccl_path)); std::vector uuids(world_size_ * kUuidLength, 0); auto s_uuid = xgboost::common::Span{uuids.data(), uuids.size()}; @@ -43,7 +46,9 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy nccl_unique_id_ = GetUniqueId(); dh::safe_cuda(cudaSetDevice(device_ordinal_)); - dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_)); + auto rc = + GetNCCLResult(stub_, stub_->CommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_)); + CHECK(rc.OK()) << rc.Report(); } NcclDeviceCommunicator::~NcclDeviceCommunicator() { @@ -51,7 +56,8 @@ NcclDeviceCommunicator::~NcclDeviceCommunicator() { return; } if (nccl_comm_) { - dh::safe_nccl(ncclCommDestroy(nccl_comm_)); + auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_)); + CHECK(rc.OK()) << rc.Report(); } if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { LOG(CONSOLE) << "======== NCCL Statistics========"; @@ -137,8 +143,10 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si auto *device_buffer = buffer.data().get(); // First gather data from all the workers. - dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type), + auto rc = GetNCCLResult( + stub_, stub_->Allgather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type), nccl_comm_, dh::DefaultStream())); + CHECK(rc.OK()) << rc.Report(); if (needs_sync_) { dh::DefaultStream().Sync(); } @@ -170,9 +178,10 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co if (IsBitwiseOp(op)) { BitwiseAllReduce(send_receive_buffer, count, data_type, op); } else { - dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count, - GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_, - dh::DefaultStream())); + auto rc = GetNCCLResult(stub_, stub_->Allreduce(send_receive_buffer, send_receive_buffer, count, + GetNcclDataType(data_type), GetNcclRedOp(op), + nccl_comm_, dh::DefaultStream())); + CHECK(rc.OK()) << rc.Report(); } allreduce_bytes_ += count * GetTypeSize(data_type); allreduce_calls_ += 1; @@ -185,8 +194,9 @@ void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_bu } dh::safe_cuda(cudaSetDevice(device_ordinal_)); - dh::safe_nccl(ncclAllGather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_, - dh::DefaultStream())); + auto rc = GetNCCLResult(stub_, stub_->Allgather(send_buffer, receive_buffer, send_size, ncclInt8, + nccl_comm_, dh::DefaultStream())); + CHECK(rc.OK()) << rc.Report(); } void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes, @@ -206,14 +216,19 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b receive_buffer->resize(total_bytes); size_t offset = 0; - dh::safe_nccl(ncclGroupStart()); - for (int32_t i = 0; i < world_size_; ++i) { - size_t as_bytes = segments->at(i); - dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes, - ncclChar, i, nccl_comm_, dh::DefaultStream())); - offset += as_bytes; - } - dh::safe_nccl(ncclGroupEnd()); + auto rc = Success() << [&] { return GetNCCLResult(stub_, stub_->GroupStart()); } << [&] { + for (int32_t i = 0; i < world_size_; ++i) { + size_t as_bytes = segments->at(i); + auto rc 
= GetNCCLResult( + stub_, stub_->Broadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes, + ncclChar, i, nccl_comm_, dh::DefaultStream())); + if (!rc.OK()) { + return rc; + } + offset += as_bytes; + } + return Success(); + } << [&] { return GetNCCLResult(stub_, stub_->GroupEnd()); }; } void NcclDeviceCommunicator::Synchronize() { diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh index 084db2046..a194b4ef2 100644 --- a/src/collective/nccl_device_communicator.cuh +++ b/src/collective/nccl_device_communicator.cuh @@ -4,8 +4,10 @@ #pragma once #include "../common/device_helpers.cuh" +#include "comm.cuh" #include "communicator.h" #include "device_communicator.cuh" +#include "nccl_stub.h" namespace xgboost { namespace collective { @@ -25,7 +27,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator { * needed. The in-memory communicator is used in tests with multiple threads, each thread * representing a rank/worker, so the additional synchronization is needed to avoid deadlocks. */ - explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync); + explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync, StringView nccl_path); ~NcclDeviceCommunicator() override; void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type, Operation op) override; @@ -64,7 +66,8 @@ class NcclDeviceCommunicator : public DeviceCommunicator { static const int kRootRank = 0; ncclUniqueId id; if (rank_ == kRootRank) { - dh::safe_nccl(ncclGetUniqueId(&id)); + auto rc = GetNCCLResult(stub_, stub_->GetUniqueId(&id)); + CHECK(rc.OK()) << rc.Report(); } Broadcast(static_cast(&id), sizeof(ncclUniqueId), static_cast(kRootRank)); return id; @@ -78,6 +81,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator { int const world_size_; int const rank_; ncclComm_t nccl_comm_{}; + std::shared_ptr stub_; ncclUniqueId nccl_unique_id_{}; size_t allreduce_bytes_{0}; // Keep statistics of the number of bytes communicated. size_t allreduce_calls_{0}; // Keep statistics of the number of reduce calls. diff --git a/src/collective/nccl_stub.cc b/src/collective/nccl_stub.cc new file mode 100644 index 000000000..f4705a46e --- /dev/null +++ b/src/collective/nccl_stub.cc @@ -0,0 +1,109 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#if defined(XGBOOST_USE_NCCL) +#include "nccl_stub.h" + +#include // for CUDA_VERSION +#include // for dlclose, dlsym, dlopen +#include + +#include // for int32_t +#include // for stringstream +#include // for string +#include // for move + +#include "xgboost/logging.h" + +namespace xgboost::collective { +NcclStub::NcclStub(StringView path) : path_{std::move(path)} { +#if defined(XGBOOST_USE_DLOPEN_NCCL) + CHECK(!path_.empty()) << "Empty path for NCCL."; + + auto cu_major = (CUDA_VERSION) / 1000; + std::stringstream ss; + ss << R"m( + +If XGBoost is installed from PyPI with pip, the error can fixed by: + +- Run `pip install nvidia-nccl-cu)m" + << cu_major << "` (Or with any CUDA version that's compatible with " << cu_major << ")."; + ss << R"m( + +Otherwise, please refer to: + + https://xgboost.readthedocs.io/en/stable/tutorials/dask.html#troubleshooting + +for more info, or open an issue on GitHub. Starting from XGBoost 2.1.0, the PyPI package +no long bundles NCCL in the binary wheel. + +)m"; + auto help = ss.str(); + std::string msg{"Failed to load NCCL from path: `" + path_ + "`. 
Error:\n "}; + + auto safe_load = [&](auto t, StringView name) { + std::stringstream errs; + auto ptr = reinterpret_cast(dlsym(handle_, name.c_str())); + if (!ptr) { + errs << "Failed to load NCCL symbol `" << name << "` from " << path_ << ". Error:\n " + << dlerror() << help; + LOG(FATAL) << errs.str(); + } + return ptr; + }; + + handle_ = dlopen(path_.c_str(), RTLD_LAZY); + if (!handle_) { + LOG(FATAL) << msg << dlerror() << help; + } + + allreduce_ = safe_load(allreduce_, "ncclAllReduce"); + broadcast_ = safe_load(broadcast_, "ncclBroadcast"); + allgather_ = safe_load(allgather_, "ncclAllGather"); + comm_init_rank_ = safe_load(comm_init_rank_, "ncclCommInitRank"); + comm_destroy_ = safe_load(comm_destroy_, "ncclCommDestroy"); + get_uniqueid_ = safe_load(get_uniqueid_, "ncclGetUniqueId"); + send_ = safe_load(send_, "ncclSend"); + recv_ = safe_load(recv_, "ncclRecv"); + group_start_ = safe_load(group_start_, "ncclGroupStart"); + group_end_ = safe_load(group_end_, "ncclGroupEnd"); + get_error_string_ = safe_load(get_error_string_, "ncclGetErrorString"); + get_version_ = safe_load(get_version_, "ncclGetVersion"); + + std::int32_t v; + CHECK_EQ(get_version_(&v), ncclSuccess); + auto patch = v % 100; + auto minor = (v / 100) % 100; + auto major = v / 10000; + + LOG(INFO) << "Loaded shared NCCL " << major << "." << minor << "." << patch << ":`" << path_ + << "`" << std::endl; +#else + allreduce_ = ncclAllReduce; + broadcast_ = ncclBroadcast; + allgather_ = ncclAllGather; + comm_init_rank_ = ncclCommInitRank; + comm_destroy_ = ncclCommDestroy; + get_uniqueid_ = ncclGetUniqueId; + send_ = ncclSend; + recv_ = ncclRecv; + group_start_ = ncclGroupStart; + group_end_ = ncclGroupEnd; + get_error_string_ = ncclGetErrorString; + get_version_ = ncclGetVersion; +#endif +}; + +NcclStub::~NcclStub() { // NOLINT +#if defined(XGBOOST_USE_DLOPEN_NCCL) + if (handle_) { + auto rc = dlclose(handle_); + if (rc != 0) { + LOG(WARNING) << "Failed to close NCCL handle:" << dlerror(); + } + } + handle_ = nullptr; +#endif // defined(XGBOOST_USE_DLOPEN_NCCL) +} +} // namespace xgboost::collective +#endif // defined(XGBOOST_USE_NCCL) diff --git a/src/collective/nccl_stub.h b/src/collective/nccl_stub.h new file mode 100644 index 000000000..a003a6f22 --- /dev/null +++ b/src/collective/nccl_stub.h @@ -0,0 +1,94 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once +#if defined(XGBOOST_USE_NCCL) +#include +#include + +#include // for string + +#include "xgboost/string_view.h" // for StringView + +namespace xgboost::collective { +class NcclStub { +#if defined(XGBOOST_USE_DLOPEN_NCCL) + void* handle_{nullptr}; +#endif // defined(XGBOOST_USE_DLOPEN_NCCL) + std::string path_; + + decltype(ncclAllReduce)* allreduce_{nullptr}; + decltype(ncclBroadcast)* broadcast_{nullptr}; + decltype(ncclAllGather)* allgather_{nullptr}; + decltype(ncclCommInitRank)* comm_init_rank_{nullptr}; + decltype(ncclCommDestroy)* comm_destroy_{nullptr}; + decltype(ncclGetUniqueId)* get_uniqueid_{nullptr}; + decltype(ncclSend)* send_{nullptr}; + decltype(ncclRecv)* recv_{nullptr}; + decltype(ncclGroupStart)* group_start_{nullptr}; + decltype(ncclGroupEnd)* group_end_{nullptr}; + decltype(ncclGetErrorString)* get_error_string_{nullptr}; + decltype(ncclGetVersion)* get_version_{nullptr}; + + public: + explicit NcclStub(StringView path); + ~NcclStub(); + + [[nodiscard]] ncclResult_t Allreduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) const { + 
CHECK(allreduce_); + return this->allreduce_(sendbuff, recvbuff, count, datatype, op, comm, stream); + } + [[nodiscard]] ncclResult_t Broadcast(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, + cudaStream_t stream) const { + CHECK(broadcast_); + return this->broadcast_(sendbuff, recvbuff, count, datatype, root, comm, stream); + } + [[nodiscard]] ncclResult_t Allgather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, + cudaStream_t stream) const { + CHECK(allgather_); + return this->allgather_(sendbuff, recvbuff, sendcount, datatype, comm, stream); + } + [[nodiscard]] ncclResult_t CommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, + int rank) const { + CHECK(comm_init_rank_); + return this->comm_init_rank_(comm, nranks, commId, rank); + } + [[nodiscard]] ncclResult_t CommDestroy(ncclComm_t comm) const { + CHECK(comm_destroy_); + return this->comm_destroy_(comm); + } + + [[nodiscard]] ncclResult_t GetUniqueId(ncclUniqueId* uniqueId) const { + CHECK(get_uniqueid_); + return this->get_uniqueid_(uniqueId); + } + [[nodiscard]] ncclResult_t Send(const void* sendbuff, size_t count, ncclDataType_t datatype, + int peer, ncclComm_t comm, cudaStream_t stream) { + CHECK(send_); + return send_(sendbuff, count, datatype, peer, comm, stream); + } + [[nodiscard]] ncclResult_t Recv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream) const { + CHECK(recv_); + return recv_(recvbuff, count, datatype, peer, comm, stream); + } + [[nodiscard]] ncclResult_t GroupStart() const { + CHECK(group_start_); + return group_start_(); + } + [[nodiscard]] ncclResult_t GroupEnd() const { + CHECK(group_end_); + return group_end_(); + } + + [[nodiscard]] const char* GetErrorString(ncclResult_t result) const { + return get_error_string_(result); + } +}; +} // namespace xgboost::collective + +#endif // defined(XGBOOST_USE_NCCL) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 066f8a3e6..89ec42f2b 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -115,30 +115,6 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) { // NOLINT } namespace dh { -#ifdef XGBOOST_USE_NCCL -#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__) - -inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) { - if (code != ncclSuccess) { - std::stringstream ss; - ss << "NCCL failure: " << ncclGetErrorString(code) << "."; - ss << " " << file << "(" << line << ")\n"; - if (code == ncclUnhandledCudaError) { - // nccl usually preserves the last error so we can get more details. - auto err = cudaPeekAtLastError(); - ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n"; - } else if (code == ncclSystemError) { - ss << " This might be caused by a network configuration issue. 
Please consider specifying " - "the network interface for NCCL via environment variables listed in its reference: " - "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n"; - } - LOG(FATAL) << ss.str(); - } - - return code; -} -#endif - inline int32_t CudaGetPointerDevice(void const *ptr) { int32_t device = -1; cudaPointerAttributes attr; diff --git a/tests/buildkite/build-cuda-with-rmm.sh b/tests/buildkite/build-cuda-with-rmm.sh index 46bc98028..615608249 100755 --- a/tests/buildkite/build-cuda-with-rmm.sh +++ b/tests/buildkite/build-cuda-with-rmm.sh @@ -21,11 +21,18 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/prune_libnccl.sh -$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \ - -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ - -DPLUGIN_RMM=ON -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ - -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} +$command_wrapper tests/ci_build/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DPLUGIN_RMM=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} echo "--- Build binary wheel" $command_wrapper bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh index 1926754b8..7bd3492a2 100755 --- a/tests/buildkite/build-cuda.sh +++ b/tests/buildkite/build-cuda.sh @@ -21,11 +21,17 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/prune_libnccl.sh -$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc" \ - -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ - -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} +$command_wrapper tests/ci_build/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} echo "--- Build binary wheel" $command_wrapper bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh index 58d250308..36f54cd3d 100755 --- a/tests/buildkite/test-cpp-gpu.sh +++ b/tests/buildkite/test-cpp-gpu.sh @@ -10,6 +10,7 @@ chmod +x build/testxgboost tests/ci_build/ci_build.sh gpu nvidia-docker \ --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ + --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ build/testxgboost echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh index 935a301a6..2aac47407 100755 --- a/tests/buildkite/test-cpp-mgpu.sh +++ b/tests/buildkite/test-cpp-mgpu.sh @@ -13,4 +13,5 @@ chmod +x build/testxgboost tests/ci_build/ci_build.sh gpu nvidia-docker \ --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ + --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ build/testxgboost --gtest_filter=*MGPU* diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh index a575878d3..c2376c021 100755 --- a/tests/buildkite/test-python-gpu.sh +++ b/tests/buildkite/test-python-gpu.sh @@ -24,7 +24,8 @@ export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "` `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" + `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "` + `"NCCL_VERSION_ARG=$NCCL_VERSION" # Run specified test suite case "$suite" in diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index 0822767c5..0a5adb6ea 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -2,6 +2,7 @@ ARG CUDA_VERSION_ARG FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 ARG CUDA_VERSION_ARG ARG RAPIDS_VERSION_ARG +ARG NCCL_VERSION_ARG # Environment ENV DEBIAN_FRONTEND noninteractive @@ -23,7 +24,9 @@ RUN \ conda install -c conda-forge mamba && \ mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \ python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \ - dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \ + nccl>=$(cut -d "-" -f 1 << $NCCL_VERSION_ARG) \ + dask \ + dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ pyspark>=3.4.0 cloudpickle cuda-python && \ mamba clean --all && \ diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index 98a0a7033..16445de2a 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -27,7 +27,7 @@ RUN \ wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ yum -y update && \ - yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} && \ + yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} && \ rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm; ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH diff --git a/tests/ci_build/prune_libnccl.sh 
b/tests/ci_build/prune_libnccl.sh deleted file mode 100755 index c5a0d8123..000000000 --- a/tests/ci_build/prune_libnccl.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash -set -e - -rm -rf tmp_nccl - -mkdir tmp_nccl -pushd tmp_nccl - -set -x - -cat << EOF > test.cu -int main(void) { return 0; } -EOF - -cat << EOF > CMakeLists.txt -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(gencode_extractor CXX C) -cmake_policy(SET CMP0104 NEW) -set(CMAKE_CUDA_HOST_COMPILER \${CMAKE_CXX_COMPILER}) -enable_language(CUDA) -include(../cmake/Utils.cmake) -compute_cmake_cuda_archs("") -add_library(test OBJECT test.cu) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -EOF - -cmake . -GNinja -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -gen_code=$(grep -o -- '--generate-code=\S*' compile_commands.json | paste -sd ' ') - -nvprune ${gen_code} /usr/lib64/libnccl_static.a -o ../libnccl_static.a - -popd -rm -rf tmp_nccl - -set +x diff --git a/tests/ci_build/rename_whl.py b/tests/ci_build/rename_whl.py index 766c88a2f..2da7db8de 100644 --- a/tests/ci_build/rename_whl.py +++ b/tests/ci_build/rename_whl.py @@ -1,22 +1,10 @@ import os import sys -from contextlib import contextmanager - - -@contextmanager -def cd(path): - path = os.path.normpath(path) - cwd = os.getcwd() - os.chdir(path) - print("cd " + path) - try: - yield path - finally: - os.chdir(cwd) +from test_utils import DirectoryExcursion if len(sys.argv) != 4: - print('Usage: {} [wheel to rename] [commit id] [platform tag]'.format(sys.argv[0])) + print("Usage: {} [wheel to rename] [commit id] [platform tag]".format(sys.argv[0])) sys.exit(1) @@ -26,20 +14,26 @@ platform_tag = sys.argv[3] dirname, basename = os.path.dirname(whl_path), os.path.basename(whl_path) -with cd(dirname): - tokens = basename.split('-') +with DirectoryExcursion(dirname): + tokens = basename.split("-") assert len(tokens) == 5 - version = tokens[1].split('+')[0] - keywords = {'pkg_name': tokens[0], - 'version': version, - 'commit_id': commit_id, - 'platform_tag': platform_tag} - new_name = '{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl'.format(**keywords) - print('Renaming {} to {}...'.format(basename, new_name)) + version = tokens[1].split("+")[0] + keywords = { + "pkg_name": tokens[0], + "version": version, + "commit_id": commit_id, + "platform_tag": platform_tag, + } + new_name = "{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl".format( + **keywords + ) + print("Renaming {} to {}...".format(basename, new_name)) if os.path.isfile(new_name): os.remove(new_name) os.rename(basename, new_name) filesize = os.path.getsize(new_name) / 1024 / 1024 # MB + print(f"Wheel size: {filesize}") + msg = f"Limit of wheel size set by PyPI is exceeded. 
{new_name}: {filesize}" assert filesize <= 300, msg diff --git a/tests/cpp/collective/test_allgather.cu b/tests/cpp/collective/test_allgather.cu index 48f7c2615..236108198 100644 --- a/tests/cpp/collective/test_allgather.cu +++ b/tests/cpp/collective/test_allgather.cu @@ -90,10 +90,10 @@ class Worker : public NCCLWorkerForTest { } }; -class AllgatherTestGPU : public SocketTest {}; +class MGPUAllgatherTest : public SocketTest {}; } // namespace -TEST_F(AllgatherTestGPU, MGPUTestVRing) { +TEST_F(MGPUAllgatherTest, MGPUTestVRing) { auto n_workers = common::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { @@ -104,7 +104,7 @@ TEST_F(AllgatherTestGPU, MGPUTestVRing) { }); } -TEST_F(AllgatherTestGPU, MGPUTestVBcast) { +TEST_F(MGPUAllgatherTest, MGPUTestVBcast) { auto n_workers = common::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { diff --git a/tests/cpp/collective/test_allreduce.cu b/tests/cpp/collective/test_allreduce.cu index af9a4e58f..04ec9f773 100644 --- a/tests/cpp/collective/test_allreduce.cu +++ b/tests/cpp/collective/test_allreduce.cu @@ -5,17 +5,15 @@ #include #include // for host_vector -#include "../../../src/collective/coll.h" // for Coll #include "../../../src/common/common.h" #include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector #include "../../../src/common/type.h" // for EraseType -#include "../helpers.h" // for MakeCUDACtx #include "test_worker.cuh" // for NCCLWorkerForTest #include "test_worker.h" // for WorkerForTest, TestDistributed namespace xgboost::collective { namespace { -class AllreduceTestGPU : public SocketTest {}; +class MGPUAllreduceTest : public SocketTest {}; class Worker : public NCCLWorkerForTest { public: @@ -47,7 +45,7 @@ class Worker : public NCCLWorkerForTest { }; } // namespace -TEST_F(AllreduceTestGPU, BitOr) { +TEST_F(MGPUAllreduceTest, BitOr) { auto n_workers = common::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { @@ -57,7 +55,7 @@ TEST_F(AllreduceTestGPU, BitOr) { }); } -TEST_F(AllreduceTestGPU, Sum) { +TEST_F(MGPUAllreduceTest, Sum) { auto n_workers = common::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index a09696c19..3d7b1efc8 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -8,6 +8,7 @@ #include #include // for string +#include "../../../src/collective/comm.cuh" #include "../../../src/collective/communicator-inl.cuh" #include "../../../src/collective/nccl_device_communicator.cuh" #include "../helpers.h" @@ -16,17 +17,15 @@ namespace xgboost { namespace collective { TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) { - auto construct = []() { NcclDeviceCommunicator comm{-1, false}; }; + auto construct = []() { NcclDeviceCommunicator comm{-1, false, DefaultNcclName()}; }; EXPECT_THROW(construct(), dmlc::Error); } TEST(NcclDeviceCommunicatorSimpleTest, SystemError) { - try { - dh::safe_nccl(ncclSystemError); - } catch (dmlc::Error const& e) { - auto str = std::string{e.what()}; - ASSERT_TRUE(str.find("environment variables") != std::string::npos); - } + auto stub = 
std::make_shared<NcclStub>(DefaultNcclName()); + auto rc = GetNCCLResult(stub, ncclSystemError); + auto msg = rc.Report(); + ASSERT_TRUE(msg.find("environment variables") != std::string::npos); } namespace { diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h index 490cdf13c..acee0f297 100644 --- a/tests/cpp/collective/test_worker.h +++ b/tests/cpp/collective/test_worker.h @@ -33,7 +33,7 @@ class WorkerForTest { tracker_port_{port}, world_size_{world}, task_id_{"t:" + std::to_string(rank)}, - comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_} { + comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_, DefaultNcclName()} { CHECK_EQ(world_size_, comm_.World()); } virtual ~WorkerForTest() = default; diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index 893582ee1..883dbbaf2 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -12,6 +12,7 @@ from hypothesis._settings import duration import xgboost as xgb from xgboost import testing as tm +from xgboost.collective import CommunicatorContext from xgboost.testing.params import hist_parameter_strategy pytestmark = [ @@ -572,6 +573,65 @@ def test_with_asyncio(local_cuda_client: Client) -> None: assert isinstance(output["history"], dict) +def test_invalid_nccl(local_cuda_client: Client) -> None: + client = local_cuda_client + workers = tm.get_client_workers(client) + args = client.sync( + dxgb._get_rabit_args, len(workers), dxgb._get_dask_config(), client + ) + + def run(wid: int) -> None: + ctx = CommunicatorContext(dmlc_nccl_path="foo", **args) + X, y, w = tm.make_regression(n_samples=10, n_features=10, use_cupy=True) + + with ctx: + with pytest.raises(ValueError, match=r"pip install"): + xgb.QuantileDMatrix(X, y, weight=w) + + futures = client.map(run, range(len(workers)), workers=workers) + client.gather(futures) + + +@pytest.mark.parametrize("tree_method", ["hist", "approx"]) +def test_nccl_load(local_cuda_client: Client, tree_method: str) -> None: + X, y, w = tm.make_regression(128, 16, use_cupy=True) + + def make_model() -> None: + xgb.XGBRegressor( + device="cuda", + tree_method=tree_method, + objective="reg:quantileerror", + verbosity=2, + quantile_alpha=[0.2, 0.8], + ).fit(X, y, sample_weight=w) + + # no nccl load when using single-node. + with tm.captured_output() as (out, err): + make_model() + assert out.getvalue().find("NCCL") == -1 + assert err.getvalue().find("NCCL") == -1 + + client = local_cuda_client + workers = tm.get_client_workers(client) + args = client.sync( + dxgb._get_rabit_args, len(workers), dxgb._get_dask_config(), client + ) + + # nccl is loaded + def run(wid: int) -> None: + # FIXME(jiamingy): https://github.com/dmlc/xgboost/issues/9147 + from xgboost.core import _LIB, _register_log_callback + _register_log_callback(_LIB) + + with CommunicatorContext(**args): + with tm.captured_output() as (out, err): + make_model() + assert out.getvalue().find("Loaded shared NCCL") != -1, out.getvalue() + + futures = client.map(run, range(len(workers)), workers=workers) + client.gather(futures) + + async def run_from_dask_array_asyncio(scheduler_address: str) -> dxgb.TrainReturnT: async with Client(scheduler_address, asynchronous=True) as client: import cupy as cp
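The stub methods in the header above only forward NCCL calls through cached function pointers guarded by CHECK(); the loading step that fills those pointers is not part of this hunk, but the USE_DLOPEN_NCCL path relies on the standard dlopen/dlsym mechanism. The sketch below is a minimal, self-contained illustration of that pattern rather than the actual XGBoost implementation: the class name, the choice of ncclGetVersion as the resolved symbol, the int-based signature, and the "libnccl.so.2" path are illustrative assumptions.

// Minimal sketch of dlopen/dlsym-based NCCL loading (POSIX/Linux).  Illustrative
// only: symbol choice, signatures, and paths are assumptions, not XGBoost code.
#include <dlfcn.h>  // dlopen, dlsym, dlerror, dlclose

#include <iostream>
#include <stdexcept>
#include <string>

class NcclStubSketch {
 public:
  explicit NcclStubSketch(std::string const& path) {
    handle_ = dlopen(path.c_str(), RTLD_LAZY | RTLD_LOCAL);
    if (!handle_) {
      // When the shared object cannot be found, the error can suggest installing
      // an NCCL wheel (cf. the `pip install` match asserted in test_invalid_nccl).
      throw std::runtime_error(std::string{"Failed to load NCCL from `"} + path + "`: " + dlerror());
    }
    get_version_ = reinterpret_cast<GetVersionFn>(dlsym(handle_, "ncclGetVersion"));
    if (!get_version_) {
      throw std::runtime_error(std::string{"Failed to resolve ncclGetVersion: "} + dlerror());
    }
  }
  ~NcclStubSketch() {
    if (handle_) {
      dlclose(handle_);
    }
  }
  int GetVersion() const {
    // Guard the cached pointer before forwarding, mirroring the CHECK() calls
    // in the stub methods above.
    if (!get_version_) {
      throw std::runtime_error("ncclGetVersion was not resolved");
    }
    int version = 0;
    int rc = get_version_(&version);  // ncclResult_t is treated as int for illustration only.
    if (rc != 0) {
      throw std::runtime_error("ncclGetVersion failed with code " + std::to_string(rc));
    }
    return version;
  }

 private:
  using GetVersionFn = int (*)(int*);
  void* handle_{nullptr};
  GetVersionFn get_version_{nullptr};
};

int main() {
  try {
    NcclStubSketch nccl{"libnccl.so.2"};  // hypothetical path; the real stub receives it as a parameter
    std::cout << "NCCL version: " << nccl.GetVersion() << std::endl;
  } catch (std::exception const& e) {
    std::cerr << e.what() << std::endl;
  }
  return 0;
}

Resolving the symbols at runtime lets the same binary run on machines without NCCL installed, which is consistent with the CI changes above that drop the pruned static libnccl and pass -DUSE_DLOPEN_NCCL=ON instead.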