Use dlopen to load NCCL. (#9796)

This PR adds optional support for loading NCCL with `dlopen` as an alternative to compile-time linking, to address the size bloat of the PyPI binary release.
- Add a CMake option to load `nccl` at runtime.
- Add an NCCL stub.

After this change, `nccl` is fetched from PyPI when XGBoost is installed with pip, either directly by a user or through `pyproject.toml`. Users who prefer to link NCCL at compile time can continue to do so without any change.

At the moment this is Linux-only, since multi-node multi-GPU (MNMG) training is supported only on Linux.
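For a quick way to tell which flavor of binary is installed, the new flag is surfaced through `xgboost.build_info()`. A minimal sketch (the output comments are illustrative):

```python
import xgboost

info = xgboost.build_info()
# "USE_DLOPEN_NCCL" is reported alongside the existing "USE_NCCL" entry.
print(info["USE_NCCL"])         # True for GPU builds with NCCL support
print(info["USE_DLOPEN_NCCL"])  # True only when NCCL is loaded at runtime via dlopen
```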
Jiaming Yuan 2023-11-22 19:27:31 +08:00 committed by GitHub
parent fedd9674c8
commit 0715ab3c10
45 changed files with 658 additions and 268 deletions

View File

@@ -69,7 +69,10 @@ option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binar
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
+# This is specifically designed for PyPI binary release and should be disabled for most of the cases.
+option(USE_DLOPEN_NCCL "Whether to load nccl dynamically." OFF)
 option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
 if(USE_CUDA)
   if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT DEFINED ENV{CUDAARCHS})
     set(GPU_COMPUTE_VER "" CACHE STRING
@@ -80,6 +83,7 @@ if(USE_CUDA)
     unset(GPU_COMPUTE_VER CACHE)
   endif()
 endif()
 # CUDA device LTO was introduced in CMake v3.25 and requires host LTO to also be enabled but can still
 # be explicitly disabled allowing for LTO on host only, host and device, or neither, but device-only LTO
 # is not a supproted configuration
@@ -115,6 +119,12 @@ endif()
 if(BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
   message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.")
 endif()
+if(USE_DLOPEN_NCCL AND (NOT USE_NCCL))
+  message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable USE_DLOPEN_NCCL.")
+endif()
+if(USE_DLOPEN_NCCL AND (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")))
+  message(SEND_ERROR "`USE_DLOPEN_NCCL` supports only Linux at the moment.")
+endif()
 if(JVM_BINDINGS AND R_LIB)
   message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
 endif()

View File

@@ -171,17 +171,24 @@ function(xgboost_set_cuda_flags target)
   endif()
 endfunction()
-macro(xgboost_link_nccl target)
+function(xgboost_link_nccl target)
+  set(xgboost_nccl_flags -DXGBOOST_USE_NCCL=1)
+  if(USE_DLOPEN_NCCL)
+    list(APPEND xgboost_nccl_flags -DXGBOOST_USE_DLOPEN_NCCL=1)
+  endif()
   if(BUILD_STATIC_LIB)
     target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR})
-    target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_NCCL=1)
+    target_compile_definitions(${target} PUBLIC ${xgboost_nccl_flags})
     target_link_libraries(${target} PUBLIC ${NCCL_LIBRARY})
   else()
     target_include_directories(${target} PRIVATE ${NCCL_INCLUDE_DIR})
-    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NCCL=1)
-    target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
+    target_compile_definitions(${target} PRIVATE ${xgboost_nccl_flags})
+    if(NOT USE_DLOPEN_NCCL)
+      target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
+    endif()
   endif()
-endmacro()
+endfunction()
 # compile options
 macro(xgboost_target_properties target)

View File

@@ -54,17 +54,24 @@ find_path(NCCL_INCLUDE_DIR
   NAMES nccl.h
   HINTS ${NCCL_ROOT}/include $ENV{NCCL_ROOT}/include)
-find_library(NCCL_LIBRARY
-  NAMES ${NCCL_LIB_NAME}
-  HINTS ${NCCL_ROOT}/lib $ENV{NCCL_ROOT}/lib/)
-message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(Nccl DEFAULT_MSG
-  NCCL_INCLUDE_DIR NCCL_LIBRARY)
-
-mark_as_advanced(
-  NCCL_INCLUDE_DIR
-  NCCL_LIBRARY
-)
+if(USE_DLOPEN_NCCL)
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Nccl DEFAULT_MSG NCCL_INCLUDE_DIR)
+  mark_as_advanced(NCCL_INCLUDE_DIR)
+else()
+  find_library(NCCL_LIBRARY
+    NAMES ${NCCL_LIB_NAME}
+    HINTS ${NCCL_ROOT}/lib $ENV{NCCL_ROOT}/lib/)
+  message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Nccl DEFAULT_MSG
+    NCCL_INCLUDE_DIR NCCL_LIBRARY)
+
+  mark_as_advanced(
+    NCCL_INCLUDE_DIR
+    NCCL_LIBRARY
+  )
+endif()

View File

@@ -536,6 +536,37 @@ Troubleshooting
 - MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
   that includes `Multiple processes within a communication group ...` upon initialization.

+.. _nccl-load:
+
+- Starting from version 2.1.0, to reduce the size of the binary wheel, the XGBoost package
+  (installed using pip) loads NCCL from the environment instead of bundling it directly. This
+  means that if you encounter an error message like "Failed to load nccl ...", NCCL is not
+  installed or not properly configured in your environment.
+
+  To resolve this issue, you can install NCCL using pip:
+
+  .. code-block:: sh
+
+    pip install nvidia-nccl-cu12  # (or any compatible CUDA version)
+
+  The default conda installation of XGBoost should not encounter this error. If you are
+  using a customized XGBoost, please make sure one of the following is true:
+
+  + XGBoost is NOT compiled with the ``USE_DLOPEN_NCCL`` flag.
+  + The ``dmlc_nccl_path`` parameter is set to the full NCCL path when initializing the collective.
+
+  Here are some additional tips for troubleshooting NCCL dependency issues:
+
+  + Check the NCCL installation path and verify that it is installed correctly. When XGBoost is
+    installed using pip, we try to find NCCL by using ``from nvidia.nccl import lib`` in Python.
+  + Ensure that you have a compatible CUDA version installed; NCCL requires a compatible CUDA
+    version to function properly.
+  + If you are not using distributed training with XGBoost and yet see this error, please open
+    an issue on GitHub.
+  + If you continue to encounter NCCL dependency issues, please open an issue on GitHub.
+
 ************
 IPv6 Support
 ************
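For illustration only (not part of the diff): a hedged sketch of the `dmlc_nccl_path` override described above, with placeholder tracker settings and a hypothetical library path.

```python
from xgboost import collective

# Placeholder tracker address/port and a hypothetical NCCL location; adjust for your setup.
args = {
    "dmlc_communicator": "rabit",
    "dmlc_tracker_uri": "127.0.0.1",
    "dmlc_tracker_port": 9091,
    "dmlc_nccl_path": "/opt/nccl/lib/libnccl.so.2",  # bypasses the automatic lookup
}

with collective.CommunicatorContext(**args):
    pass  # distributed GPU training would run here
```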

View File

@@ -1613,6 +1613,8 @@ XGB_DLL int XGTrackerFree(TrackerHandle handle);
  * - DMLC_TRACKER_PORT: Port number of the tracker.
  * - DMLC_TASK_ID: ID of the current task, can be used to obtain deterministic rank assignment.
  * - DMLC_WORKER_CONNECT_RETRY: Number of retries to connect to the tracker.
+ * - dmlc_nccl_path: The path to NCCL shared object. Only used if XGBoost is compiled with
+ *   `USE_DLOPEN_NCCL`.
  * Only applicable to the Federated communicator (use upper case for environment variables, use
  * lower case for runtime configuration):
  * - federated_server_address: Address of the federated server.

View File

@@ -1,23 +1,24 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2023, XGBoost Contributors
  */
 #ifndef XGBOOST_STRING_VIEW_H_
 #define XGBOOST_STRING_VIEW_H_
 #include <xgboost/logging.h>  // CHECK_LT
 #include <xgboost/span.h>     // Span

-#include <algorithm>  // std::equal,std::min
-#include <iterator>   // std::reverse_iterator
-#include <ostream>    // std::ostream
-#include <string>     // std::char_traits,std::string
+#include <algorithm>  // for equal, min
+#include <cstddef>    // for size_t
+#include <iterator>   // for reverse_iterator
+#include <ostream>    // for ostream
+#include <string>     // for char_traits, string

 namespace xgboost {
 struct StringView {
  private:
-  using CharT = char;  // unsigned char
+  using CharT = char;
   using Traits = std::char_traits<CharT>;
   CharT const* str_{nullptr};
-  size_t size_{0};
+  std::size_t size_{0};

  public:
   using value_type = CharT;  // NOLINT
@@ -28,40 +29,41 @@ struct StringView {
  public:
   constexpr StringView() = default;
-  constexpr StringView(CharT const* str, std::size_t size) : str_{str}, size_{size} {}
+  constexpr StringView(value_type const* str, std::size_t size) : str_{str}, size_{size} {}
   StringView(std::string const& str) : str_{str.c_str()}, size_{str.size()} {}  // NOLINT
-  constexpr StringView(CharT const* str)  // NOLINT
+  constexpr StringView(value_type const* str)  // NOLINT
       : str_{str}, size_{str == nullptr ? 0ul : Traits::length(str)} {}

-  CharT const& operator[](size_t p) const { return str_[p]; }
-  CharT const& at(size_t p) const {  // NOLINT
+  [[nodiscard]] value_type const& operator[](std::size_t p) const { return str_[p]; }
+  [[nodiscard]] explicit operator std::string() const { return {this->c_str(), this->size()}; }
+  [[nodiscard]] value_type const& at(std::size_t p) const {  // NOLINT
     CHECK_LT(p, size_);
     return str_[p];
   }
-  constexpr std::size_t size() const { return size_; }  // NOLINT
-  constexpr bool empty() const { return size() == 0; }  // NOLINT
-  StringView substr(size_t beg, size_t n) const {  // NOLINT
+  [[nodiscard]] constexpr std::size_t size() const { return size_; }  // NOLINT
+  [[nodiscard]] constexpr bool empty() const { return size() == 0; }  // NOLINT
+  [[nodiscard]] StringView substr(std::size_t beg, std::size_t n) const {  // NOLINT
     CHECK_LE(beg, size_);
-    size_t len = std::min(n, size_ - beg);
+    std::size_t len = std::min(n, size_ - beg);
     return {str_ + beg, len};
   }
-  CharT const* c_str() const { return str_; }  // NOLINT
-  constexpr CharT const* cbegin() const { return str_; }  // NOLINT
-  constexpr CharT const* cend() const { return str_ + size(); }  // NOLINT
-  constexpr CharT const* begin() const { return str_; }  // NOLINT
-  constexpr CharT const* end() const { return str_ + size(); }  // NOLINT
-  const_reverse_iterator rbegin() const noexcept {  // NOLINT
+  [[nodiscard]] value_type const* c_str() const { return str_; }  // NOLINT
+  [[nodiscard]] constexpr const_iterator cbegin() const { return str_; }  // NOLINT
+  [[nodiscard]] constexpr const_iterator cend() const { return str_ + size(); }  // NOLINT
+  [[nodiscard]] constexpr iterator begin() const { return str_; }  // NOLINT
+  [[nodiscard]] constexpr iterator end() const { return str_ + size(); }  // NOLINT
+  [[nodiscard]] const_reverse_iterator rbegin() const noexcept {  // NOLINT
     return const_reverse_iterator(this->end());
   }
-  const_reverse_iterator crbegin() const noexcept {  // NOLINT
+  [[nodiscard]] const_reverse_iterator crbegin() const noexcept {  // NOLINT
     return const_reverse_iterator(this->end());
   }
-  const_reverse_iterator rend() const noexcept {  // NOLINT
+  [[nodiscard]] const_reverse_iterator rend() const noexcept {  // NOLINT
     return const_reverse_iterator(this->begin());
   }
-  const_reverse_iterator crend() const noexcept {  // NOLINT
+  [[nodiscard]] const_reverse_iterator crend() const noexcept {  // NOLINT
     return const_reverse_iterator(this->begin());
   }
 };

View File

@@ -103,6 +103,7 @@ if __name__ == "__main__":
     if cli_args.use_cuda == 'ON':
         CONFIG['USE_CUDA'] = 'ON'
         CONFIG['USE_NCCL'] = 'ON'
+        CONFIG["USE_DLOPEN_NCCL"] = "OFF"

     args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]

View File

@@ -5,9 +5,11 @@
 #include <memory>  // for shared_ptr

+#include "../../src/collective/coll.h"           // for Coll
 #include "../../src/common/device_helpers.cuh"  // for CUDAStreamView
 #include "federated_comm.h"                     // for FederatedComm
 #include "xgboost/context.h"                    // for Context
+#include "xgboost/logging.h"

 namespace xgboost::collective {
 class CUDAFederatedComm : public FederatedComm {
@@ -16,5 +18,9 @@ class CUDAFederatedComm : public FederatedComm {
  public:
   explicit CUDAFederatedComm(Context const* ctx, std::shared_ptr<FederatedComm const> impl);
   [[nodiscard]] auto Stream() const { return stream_; }
+  Comm* MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const override {
+    LOG(FATAL) << "[Internal Error]: Invalid request for CUDA variant.";
+    return nullptr;
+  }
 };
 }  // namespace xgboost::collective

View File

@@ -10,12 +10,12 @@
 #include <memory>  // for unique_ptr
 #include <string>  // for string

-#include "../../src/collective/comm.h"    // for Comm
+#include "../../src/collective/comm.h"    // for HostComm
 #include "../../src/common/json_utils.h"  // for OptionalArg
 #include "xgboost/json.h"

 namespace xgboost::collective {
-class FederatedComm : public Comm {
+class FederatedComm : public HostComm {
   std::shared_ptr<federated::Federated::Stub> stub_;

   void Init(std::string const& host, std::int32_t port, std::int32_t world, std::int32_t rank,
@@ -64,6 +64,6 @@ class FederatedComm : public Comm {
   [[nodiscard]] bool IsFederated() const override { return true; }
   [[nodiscard]] federated::Federated::Stub* Handle() const { return stub_.get(); }

-  Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
+  [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
 };
 }  // namespace xgboost::collective

View File

@@ -15,6 +15,8 @@ class BuildConfiguration:  # pylint: disable=R0902
     use_cuda: bool = False
     # Whether to enable NCCL
     use_nccl: bool = False
+    # Whether to load nccl dynamically
+    use_dlopen_nccl: bool = False
     # Whether to enable HDFS
     use_hdfs: bool = False
     # Whether to enable Azure Storage

View File

@@ -29,7 +29,8 @@ classifiers = [
 ]
 dependencies = [
     "numpy",
-    "scipy"
+    "scipy",
+    "nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'"
 ]

 [project.urls]

View File

@@ -2,14 +2,15 @@
 import ctypes
 import json
 import logging
+import os
 import pickle
 from enum import IntEnum, unique
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 import numpy as np

 from ._typing import _T
-from .core import _LIB, _check_call, c_str, from_pystr_to_cstr, py_str
+from .core import _LIB, _check_call, build_info, c_str, from_pystr_to_cstr, py_str

 LOGGER = logging.getLogger("[xgboost.collective]")
@@ -250,6 +251,31 @@ class CommunicatorContext:
     def __init__(self, **args: Any) -> None:
         self.args = args

+        key = "dmlc_nccl_path"
+        if args.get(key, None) is not None:
+            return
+
+        binfo = build_info()
+        if not binfo["USE_DLOPEN_NCCL"]:
+            return
+
+        try:
+            # PyPI package of NCCL.
+            from nvidia.nccl import lib
+
+            # There are two versions of nvidia-nccl, one is from PyPI, another one from
+            # nvidia-pyindex. We support only the first one as the second one is too old
+            # (2.9.8 as of writing).
+            if lib.__file__ is not None:
+                dirname: Optional[str] = os.path.dirname(lib.__file__)
+            else:
+                dirname = None
+
+            if dirname:
+                path = os.path.join(dirname, "libnccl.so.2")
+                self.args[key] = path
+        except ImportError:
+            pass

     def __enter__(self) -> Dict[str, Any]:
         init(**self.args)
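The lookup above can be reproduced by hand when debugging which library a pip-installed XGBoost would pick up. A small sketch (assumes the `nvidia-nccl-cu12` wheel is installed):

```python
import os

try:
    from nvidia.nccl import lib  # provided by the nvidia-nccl-cu12 wheel

    # Same path construction as the context manager above.
    print(os.path.join(os.path.dirname(lib.__file__), "libnccl.so.2"))
except ImportError:
    print("nvidia-nccl-cu12 not installed; set dmlc_nccl_path explicitly")
```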

View File

@@ -184,6 +184,13 @@ def _py_version() -> str:
         return f.read().strip()

+def _register_log_callback(lib: ctypes.CDLL) -> None:
+    lib.XGBGetLastError.restype = ctypes.c_char_p
+    lib.callback = _get_log_callback_func()  # type: ignore
+    if lib.XGBRegisterLogCallback(lib.callback) != 0:
+        raise XGBoostError(lib.XGBGetLastError())
+

 def _load_lib() -> ctypes.CDLL:
     """Load xgboost Library."""
     lib_paths = find_lib_path()
@@ -228,10 +235,7 @@ Likely causes:
 Error message(s): {os_error_list}
 """
         )
-    lib.XGBGetLastError.restype = ctypes.c_char_p
-    lib.callback = _get_log_callback_func()  # type: ignore
-    if lib.XGBRegisterLogCallback(lib.callback) != 0:
-        raise XGBoostError(lib.XGBGetLastError())
+    _register_log_callback(lib)

     def parse(ver: str) -> Tuple[int, int, int]:
         """Avoid dependency on packaging (PEP 440)."""

View File

@@ -7,8 +7,6 @@
 #include <cinttypes>   // for strtoimax
 #include <cmath>       // for nan
 #include <cstring>     // for strcmp
-#include <fstream>     // for operator<<, basic_ostream, ios, stringstream
-#include <functional>  // for less
 #include <limits>      // for numeric_limits
 #include <map>         // for operator!=, _Rb_tree_const_iterator, _Rb_tre...
 #include <memory>      // for shared_ptr, allocator, __shared_ptr_access
@@ -22,7 +20,6 @@
 #include "../common/charconv.h"         // for from_chars, to_chars, NumericLimits, from_ch...
 #include "../common/hist_util.h"        // for HistogramCuts
 #include "../common/io.h"               // for FileExtension, LoadSequentialFile, MemoryBuf...
-#include "../common/linalg_op.h"        // for ElementWiseTransformHost
 #include "../common/threading_utils.h"  // for OmpGetNumThreads, ParallelFor
 #include "../data/adapter.h"            // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
 #include "../data/ellpack_page.h"       // for EllpackPage
@@ -35,14 +32,12 @@
 #include "dmlc/parameter.h"              // for FieldAccessEntry, FieldEntry, ParamManager
 #include "dmlc/thread_local.h"           // for ThreadLocalStore
 #include "rabit/c_api.h"                 // for RabitLinkTag
-#include "rabit/rabit.h"                 // for CheckPoint, LoadCheckPoint
 #include "xgboost/base.h"                // for bst_ulong, bst_float, GradientPair, bst_feat...
 #include "xgboost/context.h"             // for Context
 #include "xgboost/data.h"                // for DMatrix, MetaInfo, DataType, ExtSparsePage
 #include "xgboost/feature_map.h"         // for FeatureMap
 #include "xgboost/global_config.h"       // for GlobalConfiguration, GlobalConfigThreadLocal...
 #include "xgboost/host_device_vector.h"  // for HostDeviceVector
-#include "xgboost/intrusive_ptr.h"       // for xgboost
 #include "xgboost/json.h"                // for Json, get, Integer, IsA, Boolean, String
 #include "xgboost/learner.h"             // for Learner, PredictionType
 #include "xgboost/logging.h"             // for LOG_FATAL, LogMessageFatal, CHECK, LogCheck_EQ
@@ -79,6 +74,7 @@ void XGBBuildInfoDevice(Json *p_info) {
   info["USE_CUDA"] = Boolean{false};
   info["USE_NCCL"] = Boolean{false};
   info["USE_RMM"] = Boolean{false};
+  info["USE_DLOPEN_NCCL"] = Boolean{false};
 }
 }  // namespace xgboost
 #endif

View File

@@ -33,8 +33,16 @@ void XGBBuildInfoDevice(Json *p_info) {
   info["USE_NCCL"] = Boolean{true};
   v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
   info["NCCL_VERSION"] = v;
+
+#if defined(XGBOOST_USE_DLOPEN_NCCL)
+  info["USE_DLOPEN_NCCL"] = Boolean{true};
+#else
+  info["USE_DLOPEN_NCCL"] = Boolean{false};
+#endif  // defined(XGBOOST_USE_DLOPEN_NCCL)
 #else
   info["USE_NCCL"] = Boolean{false};
+  info["USE_DLOPEN_NCCL"] = Boolean{false};
 #endif

 #if defined(XGBOOST_USE_RMM)

View File

@@ -19,25 +19,6 @@ Coll* Coll::MakeCUDAVar() { return new NCCLColl{}; }
 NCCLColl::~NCCLColl() = default;

 namespace {
-Result GetNCCLResult(ncclResult_t code) {
-  if (code == ncclSuccess) {
-    return Success();
-  }
-
-  std::stringstream ss;
-  ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
-  if (code == ncclUnhandledCudaError) {
-    // nccl usually preserves the last error so we can get more details.
-    auto err = cudaPeekAtLastError();
-    ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
-  } else if (code == ncclSystemError) {
-    ss << " This might be caused by a network configuration issue. Please consider specifying "
-          "the network interface for NCCL via environment variables listed in its reference: "
-          "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
-  }
-  return Fail(ss.str());
-}
-
 auto GetNCCLType(ArrayInterfaceHandler::Type type) {
   auto fatal = [] {
     LOG(FATAL) << "Invalid type for NCCL operation.";
@@ -94,11 +75,12 @@ void RunBitwiseAllreduce(dh::CUDAStreamView stream, common::Span<std::int8_t> ou
                          common::Span<std::int8_t> data, Op op) {
   dh::device_vector<std::int8_t> buffer(data.size() * pcomm->World());
   auto* device_buffer = buffer.data().get();
+  auto stub = pcomm->Stub();

   // First gather data from all the workers.
   CHECK(handle);
-  auto rc = GetNCCLResult(
-      ncclAllGather(data.data(), device_buffer, data.size(), ncclInt8, handle, pcomm->Stream()));
+  auto rc = GetNCCLResult(stub, stub->Allgather(data.data(), device_buffer, data.size(), ncclInt8,
+                                                handle, pcomm->Stream()));
   if (!rc.OK()) {
     return rc;
   }
@@ -149,6 +131,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
   }
   auto nccl = dynamic_cast<NCCLComm const*>(&comm);
   CHECK(nccl);
+  auto stub = nccl->Stub();
+
   return Success() << [&] {
     if (IsBitwiseOp(op)) {
       return BitwiseAllReduce(nccl, nccl->Handle(), data, op);
@@ -156,9 +140,9 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
       return DispatchDType(type, [=](auto t) {
         using T = decltype(t);
         auto rdata = common::RestoreType<T>(data);
-        auto rc = ncclAllReduce(data.data(), data.data(), rdata.size(), GetNCCLType(type),
-                                GetNCCLRedOp(op), nccl->Handle(), nccl->Stream());
-        return GetNCCLResult(rc);
+        auto rc = stub->Allreduce(data.data(), data.data(), rdata.size(), GetNCCLType(type),
+                                  GetNCCLRedOp(op), nccl->Handle(), nccl->Stream());
+        return GetNCCLResult(stub, rc);
       });
     }
   } << [&] { return nccl->Block(); };
@@ -171,9 +155,11 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
   }
   auto nccl = dynamic_cast<NCCLComm const*>(&comm);
   CHECK(nccl);
+  auto stub = nccl->Stub();
+
   return Success() << [&] {
-    return GetNCCLResult(ncclBroadcast(data.data(), data.data(), data.size_bytes(), ncclInt8, root,
-                                       nccl->Handle(), nccl->Stream()));
+    return GetNCCLResult(stub, stub->Broadcast(data.data(), data.data(), data.size_bytes(),
+                                               ncclInt8, root, nccl->Handle(), nccl->Stream()));
   } << [&] { return nccl->Block(); };
 }
@@ -184,10 +170,12 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
   }
   auto nccl = dynamic_cast<NCCLComm const*>(&comm);
   CHECK(nccl);
+  auto stub = nccl->Stub();
+
   auto send = data.subspan(comm.Rank() * size, size);
   return Success() << [&] {
-    return GetNCCLResult(
-        ncclAllGather(send.data(), data.data(), size, ncclInt8, nccl->Handle(), nccl->Stream()));
+    return GetNCCLResult(stub, stub->Allgather(send.data(), data.data(), size, ncclInt8,
+                                               nccl->Handle(), nccl->Stream()));
   } << [&] { return nccl->Block(); };
 }
@@ -199,19 +187,20 @@ namespace cuda_impl {
  */
 Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const> data,
                            common::Span<std::int64_t const> sizes, common::Span<std::int8_t> recv) {
-  return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] {
+  auto stub = comm->Stub();
+  return Success() << [&stub] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] {
     std::size_t offset = 0;
     for (std::int32_t r = 0; r < comm->World(); ++r) {
       auto as_bytes = sizes[r];
-      auto rc = ncclBroadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes,
-                              ncclInt8, r, comm->Handle(), dh::DefaultStream());
+      auto rc = stub->Broadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes,
+                                ncclInt8, r, comm->Handle(), dh::DefaultStream());
       if (rc != ncclSuccess) {
-        return GetNCCLResult(rc);
+        return GetNCCLResult(stub, rc);
      }
       offset += as_bytes;
     }
     return Success();
-  } << [] { return GetNCCLResult(ncclGroupEnd()); };
+  } << [&] { return GetNCCLResult(stub, stub->GroupEnd()); };
 }
 }  // namespace cuda_impl
@@ -224,10 +213,11 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const>
   if (!comm.IsDistributed()) {
     return Success();
   }
+  auto stub = nccl->Stub();

   switch (algo) {
     case AllgatherVAlgo::kRing: {
-      return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] {
+      return Success() << [&] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] {
         // get worker offset
         detail::AllgatherVOffset(sizes, recv_segments);
         // copy data
@@ -237,8 +227,8 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const>
                                      cudaMemcpyDeviceToDevice, nccl->Stream()));
         }
         return detail::RingAllgatherV(comm, sizes, recv_segments, recv);
-      } << [] {
-        return GetNCCLResult(ncclGroupEnd());
+      } << [&] {
+        return GetNCCLResult(stub, stub->GroupEnd());
       } << [&] { return nccl->Block(); };
     }
     case AllgatherVAlgo::kBcast: {

View File

@@ -8,7 +8,8 @@
 #include "../data/array_interface.h"  // for ArrayInterfaceHandler
 #include "coll.h"                     // for Coll
 #include "comm.h"                     // for Comm
-#include "xgboost/span.h"             // for Span
+#include "nccl_stub.h"
+#include "xgboost/span.h"  // for Span

 namespace xgboost::collective {
 class NCCLColl : public Coll {

View File

@@ -7,15 +7,12 @@
 #include <chrono>   // for seconds
 #include <cstdlib>  // for exit
 #include <memory>   // for shared_ptr
-#include <mutex>    // for unique_lock
 #include <string>   // for string
 #include <utility>  // for move, forward

 #include "../common/common.h"           // for AssertGPUSupport
-#include "../common/json_utils.h"       // for OptionalArg
 #include "allgather.h"                  // for RingAllgather
 #include "protocol.h"                   // for kMagic
-#include "tracker.h"                    // for GetHostAddress
 #include "xgboost/base.h"               // for XGBOOST_STRICT_R_MODE
 #include "xgboost/collective/socket.h"  // for TCPSocket
 #include "xgboost/json.h"               // for Json, Object
@@ -62,14 +59,6 @@ Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, st
                               this->Rank(), this->World());
 }

-#if !defined(XGBOOST_USE_NCCL)
-Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
-  common::AssertGPUSupport();
-  common::AssertNCCLSupport();
-  return nullptr;
-}
-#endif  // !defined(XGBOOST_USE_NCCL)
-
 [[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport,
                                     proto::PeerInfo ninfo, std::chrono::seconds timeout,
                                     std::int32_t retry,
@@ -194,12 +183,21 @@ Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
 }

 RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
-                     std::int32_t retry, std::string task_id)
-    : Comm{std::move(host), port, timeout, retry, std::move(task_id)} {
+                     std::int32_t retry, std::string task_id, StringView nccl_path)
+    : HostComm{std::move(host), port, timeout, retry, std::move(task_id)},
+      nccl_path_{std::move(nccl_path)} {
   auto rc = this->Bootstrap(timeout_, retry_, task_id_);
   CHECK(rc.OK()) << rc.Report();
 }

+#if !defined(XGBOOST_USE_NCCL)
+Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
+  common::AssertGPUSupport();
+  common::AssertNCCLSupport();
+  return nullptr;
+}
+#endif  // !defined(XGBOOST_USE_NCCL)
+
 [[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
                                           std::string task_id) {
   TCPSocket tracker;

View File

@@ -13,19 +13,21 @@
 #include "../common/cuda_context.cuh"    // for CUDAContext
 #include "../common/device_helpers.cuh"  // for DefaultStream
 #include "../common/type.h"              // for EraseType
+#include "broadcast.h"                   // for Broadcast
 #include "comm.cuh"                      // for NCCLComm
 #include "comm.h"                        // for Comm
+#include "nccl_stub.h"                   // for NcclStub
 #include "xgboost/collective/result.h"   // for Result
 #include "xgboost/span.h"                // for Span

 namespace xgboost::collective {
 namespace {
-Result GetUniqueId(Comm const& comm, std::shared_ptr<Coll> coll, ncclUniqueId* pid) {
+Result GetUniqueId(Comm const& comm, std::shared_ptr<NcclStub> stub, std::shared_ptr<Coll> coll,
+                   ncclUniqueId* pid) {
   static const int kRootRank = 0;
   ncclUniqueId id;
   if (comm.Rank() == kRootRank) {
-    dh::safe_nccl(ncclGetUniqueId(&id));
+    auto rc = GetNCCLResult(stub, stub->GetUniqueId(&id));
+    CHECK(rc.OK()) << rc.Report();
   }
   auto rc = coll->Broadcast(
       comm, common::Span{reinterpret_cast<std::int8_t*>(&id), sizeof(ncclUniqueId)}, kRootRank);
@@ -54,11 +56,12 @@ static std::string PrintUUID(xgboost::common::Span<std::uint64_t, kUuidLength> c
 }
 }  // namespace

-Comm* Comm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const {
-  return new NCCLComm{ctx, *this, pimpl};
+Comm* RabitComm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const {
+  return new NCCLComm{ctx, *this, pimpl, StringView{this->nccl_path_}};
 }

-NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl)
+NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl,
+                   StringView nccl_path)
     : Comm{root.TrackerInfo().host, root.TrackerInfo().port, root.Timeout(), root.Retry(),
            root.TaskID()},
       stream_{ctx->CUDACtx()->Stream()} {
@@ -70,6 +73,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
   }

   dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
+  stub_ = std::make_shared<NcclStub>(nccl_path);

   std::vector<std::uint64_t> uuids(root.World() * kUuidLength, 0);
   auto s_uuid = xgboost::common::Span<std::uint64_t>{uuids.data(), uuids.size()};
@@ -95,19 +99,24 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
       << "Multiple processes within communication group running on same CUDA "
       << "device is not supported. " << PrintUUID(s_this_uuid) << "\n";

-  rc = GetUniqueId(root, pimpl, &nccl_unique_id_);
+  rc = std::move(rc) << [&] {
+    return GetUniqueId(root, this->stub_, pimpl, &nccl_unique_id_);
+  } << [&] {
+    return GetNCCLResult(this->stub_, this->stub_->CommInitRank(&nccl_comm_, root.World(),
+                                                                nccl_unique_id_, root.Rank()));
+  };
   CHECK(rc.OK()) << rc.Report();
-  dh::safe_nccl(ncclCommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank()));

   for (std::int32_t r = 0; r < root.World(); ++r) {
     this->channels_.emplace_back(
-        std::make_shared<NCCLChannel>(root, r, nccl_comm_, dh::DefaultStream()));
+        std::make_shared<NCCLChannel>(root, r, nccl_comm_, stub_, dh::DefaultStream()));
   }
 }

 NCCLComm::~NCCLComm() {
   if (nccl_comm_) {
-    dh::safe_nccl(ncclCommDestroy(nccl_comm_));
+    auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_));
+    CHECK(rc.OK()) << rc.Report();
   }
 }
 }  // namespace xgboost::collective

View File

@@ -6,9 +6,13 @@
 #ifdef XGBOOST_USE_NCCL
 #include "nccl.h"
 #endif  // XGBOOST_USE_NCCL

+#include <utility>  // for move
+
 #include "../common/device_helpers.cuh"
 #include "coll.h"
 #include "comm.h"
+#include "nccl_stub.h"  // for NcclStub
 #include "xgboost/context.h"

 namespace xgboost::collective {
@@ -21,15 +25,20 @@ inline Result GetCUDAResult(cudaError rc) {
   return Fail(msg);
 }

+#if defined(XGBOOST_USE_NCCL)
 class NCCLComm : public Comm {
   ncclComm_t nccl_comm_{nullptr};
+  std::shared_ptr<NcclStub> stub_;
   ncclUniqueId nccl_unique_id_{};
   dh::CUDAStreamView stream_;
+  std::string nccl_path_;

  public:
   [[nodiscard]] ncclComm_t Handle() const { return nccl_comm_; }
+  auto Stub() const { return stub_; }

-  explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl);
+  explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl,
+                    StringView nccl_path);
   [[nodiscard]] Result LogTracker(std::string) const override {
     LOG(FATAL) << "Device comm is used for logging.";
     return Fail("Undefined.");
@@ -43,25 +52,53 @@ class NCCLComm : public Comm {
   }
 };

+inline Result GetNCCLResult(std::shared_ptr<NcclStub> stub, ncclResult_t code) {
+  if (code == ncclSuccess) {
+    return Success();
+  }
+
+  std::stringstream ss;
+  ss << "NCCL failure: " << stub->GetErrorString(code) << ".";
+  if (code == ncclUnhandledCudaError) {
+    // nccl usually preserves the last error so we can get more details.
+    auto err = cudaPeekAtLastError();
+    ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
+  } else if (code == ncclSystemError) {
+    ss << " This might be caused by a network configuration issue. Please consider specifying "
+          "the network interface for NCCL via environment variables listed in its reference: "
+          "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
+  }
+  return Fail(ss.str());
+}
+
 class NCCLChannel : public Channel {
   std::int32_t rank_{-1};
   ncclComm_t nccl_comm_{};
+  std::shared_ptr<NcclStub> stub_;
   dh::CUDAStreamView stream_;

  public:
   explicit NCCLChannel(Comm const& comm, std::int32_t rank, ncclComm_t nccl_comm,
-                       dh::CUDAStreamView stream)
-      : rank_{rank}, nccl_comm_{nccl_comm}, Channel{comm, nullptr}, stream_{stream} {}
+                       std::shared_ptr<NcclStub> stub, dh::CUDAStreamView stream)
+      : rank_{rank},
+        nccl_comm_{nccl_comm},
+        stub_{std::move(stub)},
+        Channel{comm, nullptr},
+        stream_{stream} {}

   void SendAll(std::int8_t const* ptr, std::size_t n) override {
-    dh::safe_nccl(ncclSend(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    auto rc = GetNCCLResult(stub_, stub_->Send(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    CHECK(rc.OK()) << rc.Report();
   }
   void RecvAll(std::int8_t* ptr, std::size_t n) override {
-    dh::safe_nccl(ncclRecv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    auto rc = GetNCCLResult(stub_, stub_->Recv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    CHECK(rc.OK()) << rc.Report();
   }
   [[nodiscard]] Result Block() override {
     auto rc = stream_.Sync(false);
     return GetCUDAResult(rc);
   }
 };
+#endif  // defined(XGBOOST_USE_NCCL)
 }  // namespace xgboost::collective

View File

@@ -34,6 +34,8 @@ inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) {
   return nrank;
 }

+inline StringView DefaultNcclName() { return "libnccl.so.2"; }
+
 class Channel;
 class Coll;
@@ -86,11 +88,21 @@ class Comm : public std::enable_shared_from_this<Comm> {
   [[nodiscard]] virtual Result LogTracker(std::string msg) const = 0;
   [[nodiscard]] virtual Result SignalError(Result const&) { return Success(); }
-  virtual Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const;
 };

-class RabitComm : public Comm {
+/**
+ * @brief Base class for CPU-based communicator.
+ */
+class HostComm : public Comm {
+ public:
+  using Comm::Comm;
+  [[nodiscard]] virtual Comm* MakeCUDAVar(Context const* ctx,
+                                          std::shared_ptr<Coll> pimpl) const = 0;
+};
+
+class RabitComm : public HostComm {
+  std::string nccl_path_ = std::string{DefaultNcclName()};
+
   [[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
                                  std::string task_id);
   [[nodiscard]] Result Shutdown();
@@ -100,13 +112,15 @@ class RabitComm : public Comm {
   RabitComm() = default;
   // ctor for testing where environment is known.
   RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
-            std::int32_t retry, std::string task_id);
+            std::int32_t retry, std::string task_id, StringView nccl_path);
   ~RabitComm() noexcept(false) override;

   [[nodiscard]] bool IsFederated() const override { return false; }
   [[nodiscard]] Result LogTracker(std::string msg) const override;
   [[nodiscard]] Result SignalError(Result const&) override;
+  [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
 };

 /**

View File

@@ -37,7 +37,7 @@ namespace xgboost::collective {
 [[nodiscard]] Comm const& CommGroup::Ctx(Context const* ctx, DeviceOrd device) const {
   if (device.IsCUDA()) {
     CHECK(ctx->IsCUDA());
-    if (!gpu_comm_) {
+    if (!gpu_comm_ || gpu_comm_->World() != comm_->World()) {
       gpu_comm_.reset(comm_->MakeCUDAVar(ctx, backend_));
     }
     return *gpu_comm_;
@@ -55,7 +55,6 @@ CommGroup::CommGroup()
   }
   std::string type = OptionalArg<String>(config, "dmlc_communicator", std::string{"rabit"});

-  std::vector<std::string> keys;
   // Try both lower and upper case for compatibility
   auto get_param = [&](std::string name, auto dft, auto t) {
     std::string upper;
@@ -63,8 +62,6 @@ CommGroup::CommGroup()
                    [](char c) { return std::toupper(c); });
     std::transform(name.cbegin(), name.cend(), name.begin(),
                    [](char c) { return std::tolower(c); });
-    keys.push_back(upper);
-    keys.push_back(name);

     auto const& obj = get<Object const>(config);
     auto it = obj.find(upper);
@@ -75,19 +72,19 @@ CommGroup::CommGroup()
     }
   };
   // Common args
-  auto retry =
-      OptionalArg<Integer>(config, "dmlc_retry", static_cast<Integer::Int>(DefaultRetry()));
-  auto timeout = OptionalArg<Integer>(config, "dmlc_timeout_sec",
-                                      static_cast<Integer::Int>(DefaultTimeoutSec()));
+  auto retry = get_param("dmlc_retry", static_cast<Integer::Int>(DefaultRetry()), Integer{});
+  auto timeout =
+      get_param("dmlc_timeout_sec", static_cast<Integer::Int>(DefaultTimeoutSec()), Integer{});
   auto task_id = get_param("dmlc_task_id", std::string{}, String{});

   if (type == "rabit") {
     auto host = get_param("dmlc_tracker_uri", std::string{}, String{});
     auto port = get_param("dmlc_tracker_port", static_cast<std::int64_t>(0), Integer{});
+    auto nccl = get_param("dmlc_nccl_path", std::string{DefaultNcclName()}, String{});
     auto ptr =
         new CommGroup{std::shared_ptr<RabitComm>{new RabitComm{  // NOLINT
                           host, static_cast<std::int32_t>(port), std::chrono::seconds{timeout},
-                          static_cast<std::int32_t>(retry), task_id}},
+                          static_cast<std::int32_t>(retry), task_id, nccl}},
                       std::shared_ptr<Coll>(new Coll{})};  // NOLINT
     return ptr;
   } else if (type == "federated") {

View File

@@ -17,14 +17,16 @@ namespace xgboost::collective {
  * collective implementations.
  */
 class CommGroup {
-  std::shared_ptr<Comm> comm_;
+  std::shared_ptr<HostComm> comm_;
   mutable std::shared_ptr<Comm> gpu_comm_;
   std::shared_ptr<Coll> backend_;
   mutable std::shared_ptr<Coll> gpu_coll_;  // lazy initialization

   CommGroup(std::shared_ptr<Comm> comm, std::shared_ptr<Coll> coll)
-      : comm_{std::move(comm)}, backend_{std::move(coll)} {}
+      : comm_{std::dynamic_pointer_cast<HostComm>(comm)}, backend_{std::move(coll)} {
+    CHECK(comm_);
+  }

  public:
   CommGroup();

View File

@@ -3,6 +3,7 @@
  */
 #include "communicator.h"

+#include "comm.h"
 #include "in_memory_communicator.h"
 #include "noop_communicator.h"
 #include "rabit_communicator.h"
@@ -14,8 +15,12 @@
 namespace xgboost::collective {
 thread_local std::unique_ptr<Communicator> Communicator::communicator_{new NoOpCommunicator()};
 thread_local CommunicatorType Communicator::type_{};
+thread_local std::string Communicator::nccl_path_{};

 void Communicator::Init(Json const& config) {
+  auto nccl = OptionalArg<String>(config, "dmlc_nccl_path", std::string{DefaultNcclName()});
+  nccl_path_ = nccl;
+
   auto type = GetTypeFromEnv();
   auto const arg = GetTypeFromConfig(config);
   if (arg != CommunicatorType::kUnknown) {

View File

@@ -31,17 +31,17 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
 #ifdef XGBOOST_USE_NCCL
     switch (type_) {
       case CommunicatorType::kRabit:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_));
        break;
       case CommunicatorType::kFederated:
       case CommunicatorType::kInMemory:
         device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
         break;
       case CommunicatorType::kInMemoryNccl:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true));
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true, nccl_path_));
         break;
       default:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_));
     }
 #else
     device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));

View File

@@ -234,6 +234,7 @@ class Communicator {
   static thread_local std::unique_ptr<Communicator> communicator_;
   static thread_local CommunicatorType type_;
+  static thread_local std::string nccl_path_;
 #if defined(XGBOOST_USE_CUDA)
   static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
 #endif

View File

@ -2,12 +2,14 @@
* Copyright 2023 XGBoost contributors * Copyright 2023 XGBoost contributors
*/ */
#if defined(XGBOOST_USE_NCCL) #if defined(XGBOOST_USE_NCCL)
#include "comm.cuh"
#include "nccl_device_communicator.cuh" #include "nccl_device_communicator.cuh"
namespace xgboost { namespace xgboost {
namespace collective { namespace collective {
NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync) NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync,
StringView nccl_path)
: device_ordinal_{device_ordinal}, : device_ordinal_{device_ordinal},
needs_sync_{needs_sync}, needs_sync_{needs_sync},
world_size_{GetWorldSize()}, world_size_{GetWorldSize()},
@ -18,6 +20,7 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy
if (world_size_ == 1) { if (world_size_ == 1) {
return; return;
} }
+  stub_ = std::make_shared<NcclStub>(std::move(nccl_path));
   std::vector<uint64_t> uuids(world_size_ * kUuidLength, 0);
   auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
@@ -43,7 +46,9 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy
   nccl_unique_id_ = GetUniqueId();
   dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
+  auto rc =
+      GetNCCLResult(stub_, stub_->CommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
+  CHECK(rc.OK()) << rc.Report();
 }

 NcclDeviceCommunicator::~NcclDeviceCommunicator() {
@@ -51,7 +56,8 @@ NcclDeviceCommunicator::~NcclDeviceCommunicator() {
     return;
   }
   if (nccl_comm_) {
-    dh::safe_nccl(ncclCommDestroy(nccl_comm_));
+    auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_));
+    CHECK(rc.OK()) << rc.Report();
   }
   if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
     LOG(CONSOLE) << "======== NCCL Statistics========";
@@ -137,8 +143,10 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si
   auto *device_buffer = buffer.data().get();

   // First gather data from all the workers.
-  dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
-                              nccl_comm_, dh::DefaultStream()));
+  auto rc = GetNCCLResult(
+      stub_, stub_->Allgather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
+                              nccl_comm_, dh::DefaultStream()));
+  CHECK(rc.OK()) << rc.Report();
   if (needs_sync_) {
     dh::DefaultStream().Sync();
   }
@@ -170,9 +178,10 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co
   if (IsBitwiseOp(op)) {
     BitwiseAllReduce(send_receive_buffer, count, data_type, op);
   } else {
-    dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
-                                GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
-                                dh::DefaultStream()));
+    auto rc = GetNCCLResult(stub_, stub_->Allreduce(send_receive_buffer, send_receive_buffer, count,
+                                                    GetNcclDataType(data_type), GetNcclRedOp(op),
+                                                    nccl_comm_, dh::DefaultStream()));
+    CHECK(rc.OK()) << rc.Report();
   }
   allreduce_bytes_ += count * GetTypeSize(data_type);
   allreduce_calls_ += 1;
@@ -185,8 +194,9 @@ void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_bu
   }
   dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  dh::safe_nccl(ncclAllGather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_,
-                              dh::DefaultStream()));
+  auto rc = GetNCCLResult(stub_, stub_->Allgather(send_buffer, receive_buffer, send_size, ncclInt8,
+                                                  nccl_comm_, dh::DefaultStream()));
+  CHECK(rc.OK()) << rc.Report();
 }

 void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
@@ -206,14 +216,19 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b
   receive_buffer->resize(total_bytes);

   size_t offset = 0;
-  dh::safe_nccl(ncclGroupStart());
-  for (int32_t i = 0; i < world_size_; ++i) {
-    size_t as_bytes = segments->at(i);
-    dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
-                                ncclChar, i, nccl_comm_, dh::DefaultStream()));
-    offset += as_bytes;
-  }
-  dh::safe_nccl(ncclGroupEnd());
+  auto rc = Success() << [&] { return GetNCCLResult(stub_, stub_->GroupStart()); } << [&] {
+    for (int32_t i = 0; i < world_size_; ++i) {
+      size_t as_bytes = segments->at(i);
+      auto rc = GetNCCLResult(
+          stub_, stub_->Broadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
+                                  ncclChar, i, nccl_comm_, dh::DefaultStream()));
+      if (!rc.OK()) {
+        return rc;
+      }
+      offset += as_bytes;
+    }
+    return Success();
+  } << [&] { return GetNCCLResult(stub_, stub_->GroupEnd()); };
 }

 void NcclDeviceCommunicator::Synchronize() {
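
The AllGatherV rewrite above leans on the collective library's result-chaining idiom: `Success()` yields an OK result and `operator<<` runs the next callback only while every previous step has succeeded, so the NCCL group is always closed and the first failure is propagated without throwing. Below is a minimal, self-contained sketch of that idiom; `Result`, `Success`, and `Fail` here are simplified stand-ins, not the actual types in src/collective.

#include <iostream>
#include <string>
#include <utility>

// Simplified stand-in for the collective Result type (assumption: the real one carries more state).
struct Result {
  bool ok{true};
  std::string msg;
  bool OK() const { return ok; }
  std::string Report() const { return ok ? "OK" : msg; }
};

Result Success() { return {}; }
Result Fail(std::string m) { return Result{false, std::move(m)}; }

// Run the next step only if everything so far succeeded; otherwise short-circuit.
template <typename Fn>
Result operator<<(Result&& prev, Fn&& fn) {
  if (!prev.OK()) {
    return std::move(prev);
  }
  return fn();
}

int main() {
  auto rc = Success() << [] { return Success(); }
                      << [] { return Fail("second step failed"); }
                      << [] { return Fail("never evaluated"); };
  std::cout << rc.Report() << std::endl;  // prints "second step failed"
  return 0;
}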

View File

@@ -4,8 +4,10 @@
 #pragma once

 #include "../common/device_helpers.cuh"
+#include "comm.cuh"
 #include "communicator.h"
 #include "device_communicator.cuh"
+#include "nccl_stub.h"

 namespace xgboost {
 namespace collective {
@@ -25,7 +27,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
    * needed. The in-memory communicator is used in tests with multiple threads, each thread
    * representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
    */
-  explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
+  explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync, StringView nccl_path);
   ~NcclDeviceCommunicator() override;
   void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
                  Operation op) override;
@@ -64,7 +66,8 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
     static const int kRootRank = 0;
     ncclUniqueId id;
     if (rank_ == kRootRank) {
-      dh::safe_nccl(ncclGetUniqueId(&id));
+      auto rc = GetNCCLResult(stub_, stub_->GetUniqueId(&id));
+      CHECK(rc.OK()) << rc.Report();
     }
     Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
     return id;
@@ -78,6 +81,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
   int const world_size_;
   int const rank_;
   ncclComm_t nccl_comm_{};
+  std::shared_ptr<NcclStub> stub_;
   ncclUniqueId nccl_unique_id_{};
   size_t allreduce_bytes_{0};  // Keep statistics of the number of bytes communicated.
   size_t allreduce_calls_{0};  // Keep statistics of the number of reduce calls.

src/collective/nccl_stub.cc (new file, 109 lines)
View File

@@ -0,0 +1,109 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#if defined(XGBOOST_USE_NCCL)
#include "nccl_stub.h"
#include <cuda.h> // for CUDA_VERSION
#include <dlfcn.h> // for dlclose, dlsym, dlopen
#include <nccl.h>
#include <cstdint> // for int32_t
#include <sstream> // for stringstream
#include <string> // for string
#include <utility> // for move
#include "xgboost/logging.h"
namespace xgboost::collective {
NcclStub::NcclStub(StringView path) : path_{std::move(path)} {
#if defined(XGBOOST_USE_DLOPEN_NCCL)
CHECK(!path_.empty()) << "Empty path for NCCL.";
auto cu_major = (CUDA_VERSION) / 1000;
std::stringstream ss;
ss << R"m(
If XGBoost is installed from PyPI with pip, the error can be fixed by:
- Run `pip install nvidia-nccl-cu)m"
<< cu_major << "` (Or with any CUDA version that's compatible with " << cu_major << ").";
ss << R"m(
Otherwise, please refer to:
https://xgboost.readthedocs.io/en/stable/tutorials/dask.html#troubleshooting
for more info, or open an issue on GitHub. Starting from XGBoost 2.1.0, the PyPI package
no longer bundles NCCL in the binary wheel.
)m";
auto help = ss.str();
std::string msg{"Failed to load NCCL from path: `" + path_ + "`. Error:\n "};
auto safe_load = [&](auto t, StringView name) {
std::stringstream errs;
auto ptr = reinterpret_cast<decltype(t)>(dlsym(handle_, name.c_str()));
if (!ptr) {
errs << "Failed to load NCCL symbol `" << name << "` from " << path_ << ". Error:\n "
<< dlerror() << help;
LOG(FATAL) << errs.str();
}
return ptr;
};
handle_ = dlopen(path_.c_str(), RTLD_LAZY);
if (!handle_) {
LOG(FATAL) << msg << dlerror() << help;
}
allreduce_ = safe_load(allreduce_, "ncclAllReduce");
broadcast_ = safe_load(broadcast_, "ncclBroadcast");
allgather_ = safe_load(allgather_, "ncclAllGather");
comm_init_rank_ = safe_load(comm_init_rank_, "ncclCommInitRank");
comm_destroy_ = safe_load(comm_destroy_, "ncclCommDestroy");
get_uniqueid_ = safe_load(get_uniqueid_, "ncclGetUniqueId");
send_ = safe_load(send_, "ncclSend");
recv_ = safe_load(recv_, "ncclRecv");
group_start_ = safe_load(group_start_, "ncclGroupStart");
group_end_ = safe_load(group_end_, "ncclGroupEnd");
get_error_string_ = safe_load(get_error_string_, "ncclGetErrorString");
get_version_ = safe_load(get_version_, "ncclGetVersion");
std::int32_t v;
CHECK_EQ(get_version_(&v), ncclSuccess);
auto patch = v % 100;
auto minor = (v / 100) % 100;
auto major = v / 10000;
LOG(INFO) << "Loaded shared NCCL " << major << "." << minor << "." << patch << ":`" << path_
<< "`" << std::endl;
#else
allreduce_ = ncclAllReduce;
broadcast_ = ncclBroadcast;
allgather_ = ncclAllGather;
comm_init_rank_ = ncclCommInitRank;
comm_destroy_ = ncclCommDestroy;
get_uniqueid_ = ncclGetUniqueId;
send_ = ncclSend;
recv_ = ncclRecv;
group_start_ = ncclGroupStart;
group_end_ = ncclGroupEnd;
get_error_string_ = ncclGetErrorString;
get_version_ = ncclGetVersion;
#endif
};
NcclStub::~NcclStub() { // NOLINT
#if defined(XGBOOST_USE_DLOPEN_NCCL)
if (handle_) {
auto rc = dlclose(handle_);
if (rc != 0) {
LOG(WARNING) << "Failed to close NCCL handle:" << dlerror();
}
}
handle_ = nullptr;
#endif // defined(XGBOOST_USE_DLOPEN_NCCL)
}
} // namespace xgboost::collective
#endif // defined(XGBOOST_USE_NCCL)
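
The stub's loading path is plain `dlopen`/`dlsym`: open the shared object named by `path_`, resolve each NCCL entry point by name, and fail with an actionable message (including the `pip install nvidia-nccl-cu<major>` hint) if either step returns null. The version decoding afterwards assumes NCCL packs its version code as major * 10000 + minor * 100 + patch. A standalone sketch of the same load-and-cast pattern, using glibc's math library instead of NCCL so it can be compiled and run anywhere with `g++ sketch.cc -ldl` (the `libm.so.6` path is an assumption about a glibc-based Linux system):

#include <dlfcn.h>

#include <iostream>

int main() {
  void* handle = dlopen("libm.so.6", RTLD_LAZY);
  if (!handle) {
    std::cerr << "dlopen failed: " << dlerror() << "\n";
    return 1;
  }
  // Resolve the symbol by name and cast it to the matching function-pointer type,
  // mirroring what safe_load does for each ncclXxx entry point above.
  using CosFn = double (*)(double);
  auto cos_fn = reinterpret_cast<CosFn>(dlsym(handle, "cos"));
  if (!cos_fn) {
    std::cerr << "dlsym failed: " << dlerror() << "\n";
    dlclose(handle);
    return 1;
  }
  std::cout << "cos(0.0) = " << cos_fn(0.0) << "\n";
  dlclose(handle);
  return 0;
}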

View File

@@ -0,0 +1,94 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#if defined(XGBOOST_USE_NCCL)
#include <cuda_runtime_api.h>
#include <nccl.h>
#include <string> // for string
#include "xgboost/string_view.h" // for StringView
namespace xgboost::collective {
class NcclStub {
#if defined(XGBOOST_USE_DLOPEN_NCCL)
void* handle_{nullptr};
#endif // defined(XGBOOST_USE_DLOPEN_NCCL)
std::string path_;
decltype(ncclAllReduce)* allreduce_{nullptr};
decltype(ncclBroadcast)* broadcast_{nullptr};
decltype(ncclAllGather)* allgather_{nullptr};
decltype(ncclCommInitRank)* comm_init_rank_{nullptr};
decltype(ncclCommDestroy)* comm_destroy_{nullptr};
decltype(ncclGetUniqueId)* get_uniqueid_{nullptr};
decltype(ncclSend)* send_{nullptr};
decltype(ncclRecv)* recv_{nullptr};
decltype(ncclGroupStart)* group_start_{nullptr};
decltype(ncclGroupEnd)* group_end_{nullptr};
decltype(ncclGetErrorString)* get_error_string_{nullptr};
decltype(ncclGetVersion)* get_version_{nullptr};
public:
explicit NcclStub(StringView path);
~NcclStub();
[[nodiscard]] ncclResult_t Allreduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
cudaStream_t stream) const {
CHECK(allreduce_);
return this->allreduce_(sendbuff, recvbuff, count, datatype, op, comm, stream);
}
[[nodiscard]] ncclResult_t Broadcast(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm,
cudaStream_t stream) const {
CHECK(broadcast_);
return this->broadcast_(sendbuff, recvbuff, count, datatype, root, comm, stream);
}
[[nodiscard]] ncclResult_t Allgather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm,
cudaStream_t stream) const {
CHECK(allgather_);
return this->allgather_(sendbuff, recvbuff, sendcount, datatype, comm, stream);
}
[[nodiscard]] ncclResult_t CommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
int rank) const {
CHECK(comm_init_rank_);
return this->comm_init_rank_(comm, nranks, commId, rank);
}
[[nodiscard]] ncclResult_t CommDestroy(ncclComm_t comm) const {
CHECK(comm_destroy_);
return this->comm_destroy_(comm);
}
[[nodiscard]] ncclResult_t GetUniqueId(ncclUniqueId* uniqueId) const {
CHECK(get_uniqueid_);
return this->get_uniqueid_(uniqueId);
}
[[nodiscard]] ncclResult_t Send(const void* sendbuff, size_t count, ncclDataType_t datatype,
int peer, ncclComm_t comm, cudaStream_t stream) {
CHECK(send_);
return send_(sendbuff, count, datatype, peer, comm, stream);
}
[[nodiscard]] ncclResult_t Recv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) const {
CHECK(recv_);
return recv_(recvbuff, count, datatype, peer, comm, stream);
}
[[nodiscard]] ncclResult_t GroupStart() const {
CHECK(group_start_);
return group_start_();
}
[[nodiscard]] ncclResult_t GroupEnd() const {
CHECK(group_end_);
return group_end_();
}
[[nodiscard]] const char* GetErrorString(ncclResult_t result) const {
return get_error_string_(result);
}
};
} // namespace xgboost::collective
#endif // defined(XGBOOST_USE_NCCL)
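
One detail worth calling out in the NcclStub header: each member is declared as `decltype(ncclAllReduce)*` and so on, which gives it exactly the function-pointer type of the corresponding NCCL declaration without restating the signature, and the same pointer can then be filled either from `dlsym` or from the compile-time linked symbol. A small self-contained sketch of the same trick against a made-up C API (`third_party_add` is purely hypothetical):

#include <cstdio>

// Pretend declaration of a third-party C API, normally pulled in from its header.
extern "C" int third_party_add(int a, int b);
// Definition provided here only so the sketch links; with dlopen it would live in a shared object.
extern "C" int third_party_add(int a, int b) { return a + b; }

class ApiStub {
  // decltype(third_party_add)* is exactly `int (*)(int, int)`; the signature is never repeated.
  decltype(third_party_add)* add_{nullptr};

 public:
  // Direct assignment stands in for the dlsym() + reinterpret_cast path used by NcclStub.
  ApiStub() : add_{&third_party_add} {}
  int Add(int a, int b) const { return add_(a, b); }
};

int main() {
  ApiStub stub;
  std::printf("%d\n", stub.Add(20, 22));  // prints 42
  return 0;
}

Because the NCCL C API functions are not overloaded, `decltype` resolves unambiguously, which is what lets the header stay a thin, signature-faithful shim over either linking mode.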

View File

@@ -115,30 +115,6 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) { // NOLINT
 }

 namespace dh {
-#ifdef XGBOOST_USE_NCCL
-#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
-
-inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
-  if (code != ncclSuccess) {
-    std::stringstream ss;
-    ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
-    ss << " " << file << "(" << line << ")\n";
-    if (code == ncclUnhandledCudaError) {
-      // nccl usually preserves the last error so we can get more details.
-      auto err = cudaPeekAtLastError();
-      ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
-    } else if (code == ncclSystemError) {
-      ss << " This might be caused by a network configuration issue. Please consider specifying "
-            "the network interface for NCCL via environment variables listed in its reference: "
-            "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
-    }
-    LOG(FATAL) << ss.str();
-  }
-  return code;
-}
-#endif
-
 inline int32_t CudaGetPointerDevice(void const *ptr) {
   int32_t device = -1;
   cudaPointerAttributes attr;

View File

@@ -21,11 +21,18 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg
 `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"

 echo "--- Build libxgboost from the source"
-$command_wrapper tests/ci_build/prune_libnccl.sh
-$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \
-  -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
-  -DPLUGIN_RMM=ON -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
-  -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
+$command_wrapper tests/ci_build/build_via_cmake.sh \
+  -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \
+  -DUSE_CUDA=ON \
+  -DUSE_OPENMP=ON \
+  -DHIDE_CXX_SYMBOLS=ON \
+  -DPLUGIN_FEDERATED=ON \
+  -DPLUGIN_RMM=ON \
+  -DUSE_NCCL=ON \
+  -DUSE_NCCL_LIB_PATH=ON \
+  -DNCCL_INCLUDE_DIR=/usr/include \
+  -DUSE_DLOPEN_NCCL=ON \
+  ${arch_flag}
 echo "--- Build binary wheel"
 $command_wrapper bash -c \
   "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"

View File

@@ -21,11 +21,17 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg
 `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"

 echo "--- Build libxgboost from the source"
-$command_wrapper tests/ci_build/prune_libnccl.sh
-$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc" \
-  -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
-  -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
-  -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
+$command_wrapper tests/ci_build/build_via_cmake.sh \
+  -DCMAKE_PREFIX_PATH="/opt/grpc" \
+  -DUSE_CUDA=ON \
+  -DUSE_OPENMP=ON \
+  -DHIDE_CXX_SYMBOLS=ON \
+  -DPLUGIN_FEDERATED=ON \
+  -DUSE_NCCL=ON \
+  -DUSE_NCCL_LIB_PATH=ON \
+  -DNCCL_INCLUDE_DIR=/usr/include \
+  -DUSE_DLOPEN_NCCL=ON \
+  ${arch_flag}
 echo "--- Build binary wheel"
 $command_wrapper bash -c \
   "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"

View File

@@ -10,6 +10,7 @@ chmod +x build/testxgboost
 tests/ci_build/ci_build.sh gpu nvidia-docker \
   --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
   --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
   build/testxgboost

 echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"

View File

@@ -13,4 +13,5 @@ chmod +x build/testxgboost
 tests/ci_build/ci_build.sh gpu nvidia-docker \
   --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
   --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
   build/testxgboost --gtest_filter=*MGPU*

View File

@@ -24,7 +24,8 @@ export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
 command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
 `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
-`"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+`"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "`
+`"NCCL_VERSION_ARG=$NCCL_VERSION"

 # Run specified test suite
 case "$suite" in

View File

@@ -2,6 +2,7 @@ ARG CUDA_VERSION_ARG
 FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04
 ARG CUDA_VERSION_ARG
 ARG RAPIDS_VERSION_ARG
+ARG NCCL_VERSION_ARG

 # Environment
 ENV DEBIAN_FRONTEND noninteractive
@@ -23,7 +24,9 @@ RUN \
     conda install -c conda-forge mamba && \
     mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
         python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
-        dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
+        nccl>=$(cut -d "-" -f 1 <<< $NCCL_VERSION_ARG) \
+        dask \
+        dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
         numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
         pyspark>=3.4.0 cloudpickle cuda-python && \
     mamba clean --all && \

View File

@@ -27,7 +27,7 @@ RUN \
     wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
     rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
     yum -y update && \
-    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
+    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
     rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm;

 ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH

View File

@@ -1,35 +0,0 @@
#!/usr/bin/env bash
set -e
rm -rf tmp_nccl
mkdir tmp_nccl
pushd tmp_nccl
set -x
cat << EOF > test.cu
int main(void) { return 0; }
EOF
cat << EOF > CMakeLists.txt
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(gencode_extractor CXX C)
cmake_policy(SET CMP0104 NEW)
set(CMAKE_CUDA_HOST_COMPILER \${CMAKE_CXX_COMPILER})
enable_language(CUDA)
include(../cmake/Utils.cmake)
compute_cmake_cuda_archs("")
add_library(test OBJECT test.cu)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
EOF
cmake . -GNinja -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
gen_code=$(grep -o -- '--generate-code=\S*' compile_commands.json | paste -sd ' ')
nvprune ${gen_code} /usr/lib64/libnccl_static.a -o ../libnccl_static.a
popd
rm -rf tmp_nccl
set +x

View File

@@ -1,22 +1,10 @@
 import os
 import sys
-from contextlib import contextmanager
-
-
-@contextmanager
-def cd(path):
-    path = os.path.normpath(path)
-    cwd = os.getcwd()
-    os.chdir(path)
-    print("cd " + path)
-    try:
-        yield path
-    finally:
-        os.chdir(cwd)
+
+from test_utils import DirectoryExcursion

 if len(sys.argv) != 4:
-    print('Usage: {} [wheel to rename] [commit id] [platform tag]'.format(sys.argv[0]))
+    print("Usage: {} [wheel to rename] [commit id] [platform tag]".format(sys.argv[0]))
     sys.exit(1)
@@ -26,20 +14,26 @@ platform_tag = sys.argv[3]
 dirname, basename = os.path.dirname(whl_path), os.path.basename(whl_path)

-with cd(dirname):
-    tokens = basename.split('-')
+with DirectoryExcursion(dirname):
+    tokens = basename.split("-")
     assert len(tokens) == 5
-    version = tokens[1].split('+')[0]
-    keywords = {'pkg_name': tokens[0],
-                'version': version,
-                'commit_id': commit_id,
-                'platform_tag': platform_tag}
-    new_name = '{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl'.format(**keywords)
-    print('Renaming {} to {}...'.format(basename, new_name))
+    version = tokens[1].split("+")[0]
+    keywords = {
+        "pkg_name": tokens[0],
+        "version": version,
+        "commit_id": commit_id,
+        "platform_tag": platform_tag,
+    }
+    new_name = "{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl".format(
+        **keywords
+    )
+    print("Renaming {} to {}...".format(basename, new_name))
     if os.path.isfile(new_name):
         os.remove(new_name)
     os.rename(basename, new_name)
     filesize = os.path.getsize(new_name) / 1024 / 1024  # MB
+    print(f"Wheel size: {filesize}")
+
     msg = f"Limit of wheel size set by PyPI is exceeded. {new_name}: {filesize}"
     assert filesize <= 300, msg

View File

@@ -90,10 +90,10 @@ class Worker : public NCCLWorkerForTest {
   }
 };

-class AllgatherTestGPU : public SocketTest {};
+class MGPUAllgatherTest : public SocketTest {};
 }  // namespace

-TEST_F(AllgatherTestGPU, MGPUTestVRing) {
+TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
   auto n_workers = common::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {
@@ -104,7 +104,7 @@ TEST_F(AllgatherTestGPU, MGPUTestVRing) {
   });
 }

-TEST_F(AllgatherTestGPU, MGPUTestVBcast) {
+TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
   auto n_workers = common::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {

View File

@@ -5,17 +5,15 @@
 #include <gtest/gtest.h>
 #include <thrust/host_vector.h>  // for host_vector

-#include "../../../src/collective/coll.h"  // for Coll
 #include "../../../src/common/common.h"
 #include "../../../src/common/device_helpers.cuh"  // for ToSpan, device_vector
 #include "../../../src/common/type.h"  // for EraseType
-#include "../helpers.h"  // for MakeCUDACtx
 #include "test_worker.cuh"  // for NCCLWorkerForTest
 #include "test_worker.h"  // for WorkerForTest, TestDistributed

 namespace xgboost::collective {
 namespace {
-class AllreduceTestGPU : public SocketTest {};
+class MGPUAllreduceTest : public SocketTest {};

 class Worker : public NCCLWorkerForTest {
  public:
@@ -47,7 +45,7 @@ class Worker : public NCCLWorkerForTest {
 };
 }  // namespace

-TEST_F(AllreduceTestGPU, BitOr) {
+TEST_F(MGPUAllreduceTest, BitOr) {
   auto n_workers = common::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {
@@ -57,7 +55,7 @@ TEST_F(AllreduceTestGPU, BitOr) {
   });
 }

-TEST_F(AllreduceTestGPU, Sum) {
+TEST_F(MGPUAllreduceTest, Sum) {
   auto n_workers = common::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {

View File

@@ -8,6 +8,7 @@
 #include <bitset>
 #include <string>  // for string

+#include "../../../src/collective/comm.cuh"
 #include "../../../src/collective/communicator-inl.cuh"
 #include "../../../src/collective/nccl_device_communicator.cuh"
 #include "../helpers.h"
@@ -16,17 +17,15 @@ namespace xgboost {
 namespace collective {

 TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) {
-  auto construct = []() { NcclDeviceCommunicator comm{-1, false}; };
+  auto construct = []() { NcclDeviceCommunicator comm{-1, false, DefaultNcclName()}; };
   EXPECT_THROW(construct(), dmlc::Error);
 }

 TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
-  try {
-    dh::safe_nccl(ncclSystemError);
-  } catch (dmlc::Error const& e) {
-    auto str = std::string{e.what()};
-    ASSERT_TRUE(str.find("environment variables") != std::string::npos);
-  }
+  auto stub = std::make_shared<NcclStub>(DefaultNcclName());
+  auto rc = GetNCCLResult(stub, ncclSystemError);
+  auto msg = rc.Report();
+  ASSERT_TRUE(msg.find("environment variables") != std::string::npos);
 }

 namespace {

View File

@@ -33,7 +33,7 @@ class WorkerForTest {
         tracker_port_{port},
         world_size_{world},
         task_id_{"t:" + std::to_string(rank)},
-        comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_} {
+        comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_, DefaultNcclName()} {
     CHECK_EQ(world_size_, comm_.World());
   }
   virtual ~WorkerForTest() = default;

View File

@@ -12,6 +12,7 @@ from hypothesis._settings import duration

 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.collective import CommunicatorContext
 from xgboost.testing.params import hist_parameter_strategy

 pytestmark = [
@@ -572,6 +573,65 @@ def test_with_asyncio(local_cuda_client: Client) -> None:
     assert isinstance(output["history"], dict)


+def test_invalid_nccl(local_cuda_client: Client) -> None:
+    client = local_cuda_client
+    workers = tm.get_client_workers(client)
+    args = client.sync(
+        dxgb._get_rabit_args, len(workers), dxgb._get_dask_config(), client
+    )
+
+    def run(wid: int) -> None:
+        ctx = CommunicatorContext(dmlc_nccl_path="foo", **args)
+        X, y, w = tm.make_regression(n_samples=10, n_features=10, use_cupy=True)
+        with ctx:
+            with pytest.raises(ValueError, match=r"pip install"):
+                xgb.QuantileDMatrix(X, y, weight=w)
+
+    futures = client.map(run, range(len(workers)), workers=workers)
+    client.gather(futures)
+
+
+@pytest.mark.parametrize("tree_method", ["hist", "approx"])
+def test_nccl_load(local_cuda_client: Client, tree_method: str) -> None:
+    X, y, w = tm.make_regression(128, 16, use_cupy=True)
+
+    def make_model() -> None:
+        xgb.XGBRegressor(
+            device="cuda",
+            tree_method=tree_method,
+            objective="reg:quantileerror",
+            verbosity=2,
+            quantile_alpha=[0.2, 0.8],
+        ).fit(X, y, sample_weight=w)
+
+    # no nccl load when using single-node.
+    with tm.captured_output() as (out, err):
+        make_model()
+    assert out.getvalue().find("NCCL") == -1
+    assert err.getvalue().find("NCCL") == -1
+
+    client = local_cuda_client
+    workers = tm.get_client_workers(client)
+    args = client.sync(
+        dxgb._get_rabit_args, len(workers), dxgb._get_dask_config(), client
+    )
+
+    # nccl is loaded
+    def run(wid: int) -> None:
+        # FIXME(jiamingy): https://github.com/dmlc/xgboost/issues/9147
+        from xgboost.core import _LIB, _register_log_callback
+
+        _register_log_callback(_LIB)
+
+        with CommunicatorContext(**args):
+            with tm.captured_output() as (out, err):
+                make_model()
+            assert out.getvalue().find("Loaded shared NCCL") != -1, out.getvalue()
+
+    futures = client.map(run, range(len(workers)), workers=workers)
+    client.gather(futures)
+
+
 async def run_from_dask_array_asyncio(scheduler_address: str) -> dxgb.TrainReturnT:
     async with Client(scheduler_address, asynchronous=True) as client:
         import cupy as cp