Use dlopen to load NCCL. (#9796)
This PR adds optional support for loading NCCL with `dlopen` as an alternative to compile-time linking, to address the size bloat of the PyPI binary release.

- Add a CMake option to load `nccl` at runtime.
- Add an NCCL stub.

After this change, `nccl` is fetched from PyPI when XGBoost is installed with pip, either directly by a user or through `pyproject.toml`. Those who want to link NCCL at compile time can continue to do so without any change. At the moment this is Linux only, since we only support MNMG on Linux.
parent fedd9674c8
commit 0715ab3c10
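The core of the change is a small C++ stub that resolves NCCL symbols through `dlopen`/`dlsym` at runtime instead of linking `libnccl` into the binary. As a rough illustration of the same idea (not the stub added by this PR), the Python snippet below uses `ctypes`, which performs a `dlopen` under the hood, to load `libnccl.so.2` and query its version; it also doubles as a quick check that NCCL is resolvable on a machine. The library name matches the communicator's default; everything else here is illustrative.

import ctypes

# Illustration only: ctypes.CDLL dlopen()s the shared object at runtime,
# which is the mechanism the new C++ NCCL stub relies on instead of
# linking libnccl at build time.  Pass a full path if the library lives
# outside the default loader search path.
nccl = ctypes.CDLL("libnccl.so.2")  # raises OSError when NCCL cannot be found

version = ctypes.c_int(0)
nccl.ncclGetVersion(ctypes.byref(version))  # e.g. 21806 for NCCL 2.18.6
print("loaded NCCL version:", version.value)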
@@ -69,7 +69,10 @@ option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binar
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
+# This is specifically designed for PyPI binary release and should be disabled for most of the cases.
+option(USE_DLOPEN_NCCL "Whether to load nccl dynamically." OFF)
 option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)

 if(USE_CUDA)
   if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT DEFINED ENV{CUDAARCHS})
     set(GPU_COMPUTE_VER "" CACHE STRING
@@ -80,6 +83,7 @@ if(USE_CUDA)
     unset(GPU_COMPUTE_VER CACHE)
   endif()
 endif()

 # CUDA device LTO was introduced in CMake v3.25 and requires host LTO to also be enabled but can still
 # be explicitly disabled allowing for LTO on host only, host and device, or neither, but device-only LTO
 # is not a supproted configuration
@@ -115,6 +119,12 @@ endif()
 if(BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
   message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.")
 endif()
+if(USE_DLOPEN_NCCL AND (NOT USE_NCCL))
+  message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable USE_DLOPEN_NCCL.")
+endif()
+if(USE_DLOPEN_NCCL AND (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")))
+  message(SEND_ERROR "`USE_DLOPEN_NCCL` supports only Linux at the moment.")
+endif()
 if(JVM_BINDINGS AND R_LIB)
   message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
 endif()
@@ -171,17 +171,24 @@ function(xgboost_set_cuda_flags target)
   endif()
 endfunction()

-macro(xgboost_link_nccl target)
+function(xgboost_link_nccl target)
+  set(xgboost_nccl_flags -DXGBOOST_USE_NCCL=1)
+  if(USE_DLOPEN_NCCL)
+    list(APPEND xgboost_nccl_flags -DXGBOOST_USE_DLOPEN_NCCL=1)
+  endif()
+
   if(BUILD_STATIC_LIB)
     target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR})
-    target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_NCCL=1)
+    target_compile_definitions(${target} PUBLIC ${xgboost_nccl_flags})
     target_link_libraries(${target} PUBLIC ${NCCL_LIBRARY})
   else()
     target_include_directories(${target} PRIVATE ${NCCL_INCLUDE_DIR})
-    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NCCL=1)
-    target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
+    target_compile_definitions(${target} PRIVATE ${xgboost_nccl_flags})
+    if(NOT USE_DLOPEN_NCCL)
+      target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
+    endif()
   endif()
-endmacro()
+endfunction()

 # compile options
 macro(xgboost_target_properties target)
@@ -54,17 +54,24 @@ find_path(NCCL_INCLUDE_DIR
   NAMES nccl.h
   HINTS ${NCCL_ROOT}/include $ENV{NCCL_ROOT}/include)

-find_library(NCCL_LIBRARY
-  NAMES ${NCCL_LIB_NAME}
-  HINTS ${NCCL_ROOT}/lib $ENV{NCCL_ROOT}/lib/)
-
-message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(Nccl DEFAULT_MSG
-  NCCL_INCLUDE_DIR NCCL_LIBRARY)
-
-mark_as_advanced(
-  NCCL_INCLUDE_DIR
-  NCCL_LIBRARY
-)
+if(USE_DLOPEN_NCCL)
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Nccl DEFAULT_MSG NCCL_INCLUDE_DIR)
+
+  mark_as_advanced(NCCL_INCLUDE_DIR)
+else()
+  find_library(NCCL_LIBRARY
+    NAMES ${NCCL_LIB_NAME}
+    HINTS ${NCCL_ROOT}/lib $ENV{NCCL_ROOT}/lib/)
+
+  message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Nccl DEFAULT_MSG
+    NCCL_INCLUDE_DIR NCCL_LIBRARY)
+
+  mark_as_advanced(
+    NCCL_INCLUDE_DIR
+    NCCL_LIBRARY
+  )
+endif()
@@ -536,6 +536,37 @@ Troubleshooting
 - MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
   that includes `Multiple processes within a communication group ...` upon initialization.

+.. _nccl-load:
+
+- Starting from version 2.1.0, to reduce the size of the binary wheel, the XGBoost package
+  (installed using pip) loads NCCL from the environment instead of bundling it
+  directly. This means that if you encounter an error message like
+  "Failed to load nccl ...", it indicates that NCCL is not installed or properly
+  configured in your environment.
+
+  To resolve this issue, you can install NCCL using pip:
+
+  .. code-block:: sh
+
+    pip install nvidia-nccl-cu12  # (or with any compatible CUDA version)
+
+  The default conda installation of XGBoost should not encounter this error. If you are
+  using a customized XGBoost, please make sure one of the followings is true:
+
+  + XGBoost is NOT compiled with the `USE_DLOPEN_NCCL` flag.
+  + The `dmlc_nccl_path` parameter is set to full NCCL path when initializing the collective.
+
+  Here are some additional tips for troubleshooting NCCL dependency issues:
+
+  + Check the NCCL installation path and verify that it's installed correctly. We try to
+    find NCCL by using ``from nvidia.nccl import lib`` in Python when XGBoost is installed
+    using pip.
+  + Ensure that you have the correct CUDA version installed. NCCL requires a compatible
+    CUDA version to function properly.
+  + If you are not using distributed training with XGBoost and yet see this error, please
+    open an issue on GitHub.
+  + If you continue to encounter NCCL dependency issues, please open an issue on GitHub.
+
 ************
 IPv6 Support
 ************
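As a usage sketch for the `dmlc_nccl_path` parameter documented above: the snippet below is illustrative only, the tracker address and port are placeholders for whatever your cluster provides, and the NCCL path is an example location. Only `dmlc_nccl_path` is specific to builds with `USE_DLOPEN_NCCL`; when it is omitted, the automatic lookup described later in this diff is used.

from xgboost import collective

# Placeholders: substitute the tracker endpoint of your own deployment.
with collective.CommunicatorContext(
    dmlc_communicator="rabit",
    dmlc_tracker_uri="192.0.2.1",   # hypothetical tracker address
    dmlc_tracker_port=9091,         # hypothetical tracker port
    dmlc_nccl_path="/usr/lib/x86_64-linux-gnu/libnccl.so.2",  # example path
):
    ...  # distributed training runs inside this context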
@@ -1613,6 +1613,8 @@ XGB_DLL int XGTrackerFree(TrackerHandle handle);
  * - DMLC_TRACKER_PORT: Port number of the tracker.
  * - DMLC_TASK_ID: ID of the current task, can be used to obtain deterministic rank assignment.
  * - DMLC_WORKER_CONNECT_RETRY: Number of retries to connect to the tracker.
+ * - dmlc_nccl_path: The path to NCCL shared object. Only used if XGBoost is compiled with
+ *   `USE_DLOPEN_NCCL`.
  * Only applicable to the Federated communicator (use upper case for environment variables, use
  *   lower case for runtime configuration):
  * - federated_server_address: Address of the federated server.
@@ -1,23 +1,24 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2023, XGBoost Contributors
  */
 #ifndef XGBOOST_STRING_VIEW_H_
 #define XGBOOST_STRING_VIEW_H_
 #include <xgboost/logging.h>  // CHECK_LT
 #include <xgboost/span.h>  // Span

-#include <algorithm>  // std::equal,std::min
-#include <iterator>  // std::reverse_iterator
-#include <ostream>  // std::ostream
-#include <string>  // std::char_traits,std::string
+#include <algorithm>  // for equal, min
+#include <cstddef>  // for size_t
+#include <iterator>  // for reverse_iterator
+#include <ostream>  // for ostream
+#include <string>  // for char_traits, string

 namespace xgboost {
 struct StringView {
  private:
-  using CharT = char;  // unsigned char
+  using CharT = char;
   using Traits = std::char_traits<CharT>;
   CharT const* str_{nullptr};
-  size_t size_{0};
+  std::size_t size_{0};

  public:
   using value_type = CharT;  // NOLINT
@@ -28,40 +29,41 @@ struct StringView {

  public:
   constexpr StringView() = default;
-  constexpr StringView(CharT const* str, std::size_t size) : str_{str}, size_{size} {}
+  constexpr StringView(value_type const* str, std::size_t size) : str_{str}, size_{size} {}
   StringView(std::string const& str) : str_{str.c_str()}, size_{str.size()} {}  // NOLINT
-  constexpr StringView(CharT const* str)  // NOLINT
+  constexpr StringView(value_type const* str)  // NOLINT
       : str_{str}, size_{str == nullptr ? 0ul : Traits::length(str)} {}

-  CharT const& operator[](size_t p) const { return str_[p]; }
-  CharT const& at(size_t p) const {  // NOLINT
+  [[nodiscard]] value_type const& operator[](std::size_t p) const { return str_[p]; }
+  [[nodiscard]] explicit operator std::string() const { return {this->c_str(), this->size()}; }
+  [[nodiscard]] value_type const& at(std::size_t p) const {  // NOLINT
     CHECK_LT(p, size_);
     return str_[p];
   }
-  constexpr std::size_t size() const { return size_; }  // NOLINT
-  constexpr bool empty() const { return size() == 0; }  // NOLINT
-  StringView substr(size_t beg, size_t n) const {  // NOLINT
+  [[nodiscard]] constexpr std::size_t size() const { return size_; }  // NOLINT
+  [[nodiscard]] constexpr bool empty() const { return size() == 0; }  // NOLINT
+  [[nodiscard]] StringView substr(std::size_t beg, std::size_t n) const {  // NOLINT
     CHECK_LE(beg, size_);
-    size_t len = std::min(n, size_ - beg);
+    std::size_t len = std::min(n, size_ - beg);
     return {str_ + beg, len};
   }
-  CharT const* c_str() const { return str_; }  // NOLINT
+  [[nodiscard]] value_type const* c_str() const { return str_; }  // NOLINT

-  constexpr CharT const* cbegin() const { return str_; }  // NOLINT
-  constexpr CharT const* cend() const { return str_ + size(); }  // NOLINT
-  constexpr CharT const* begin() const { return str_; }  // NOLINT
-  constexpr CharT const* end() const { return str_ + size(); }  // NOLINT
+  [[nodiscard]] constexpr const_iterator cbegin() const { return str_; }  // NOLINT
+  [[nodiscard]] constexpr const_iterator cend() const { return str_ + size(); }  // NOLINT
+  [[nodiscard]] constexpr iterator begin() const { return str_; }  // NOLINT
+  [[nodiscard]] constexpr iterator end() const { return str_ + size(); }  // NOLINT

-  const_reverse_iterator rbegin() const noexcept {  // NOLINT
+  [[nodiscard]] const_reverse_iterator rbegin() const noexcept {  // NOLINT
     return const_reverse_iterator(this->end());
   }
-  const_reverse_iterator crbegin() const noexcept {  // NOLINT
+  [[nodiscard]] const_reverse_iterator crbegin() const noexcept {  // NOLINT
     return const_reverse_iterator(this->end());
   }
-  const_reverse_iterator rend() const noexcept {  // NOLINT
+  [[nodiscard]] const_reverse_iterator rend() const noexcept {  // NOLINT
     return const_reverse_iterator(this->begin());
   }
-  const_reverse_iterator crend() const noexcept {  // NOLINT
+  [[nodiscard]] const_reverse_iterator crend() const noexcept {  // NOLINT
     return const_reverse_iterator(this->begin());
   }
 };
@@ -103,6 +103,7 @@ if __name__ == "__main__":
     if cli_args.use_cuda == 'ON':
         CONFIG['USE_CUDA'] = 'ON'
         CONFIG['USE_NCCL'] = 'ON'
+        CONFIG["USE_DLOPEN_NCCL"] = "OFF"

     args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]

@@ -5,9 +5,11 @@

 #include <memory>  // for shared_ptr

+#include "../../src/collective/coll.h"  // for Coll
 #include "../../src/common/device_helpers.cuh"  // for CUDAStreamView
 #include "federated_comm.h"  // for FederatedComm
 #include "xgboost/context.h"  // for Context
+#include "xgboost/logging.h"

 namespace xgboost::collective {
 class CUDAFederatedComm : public FederatedComm {
@@ -16,5 +18,9 @@ class CUDAFederatedComm : public FederatedComm {
  public:
   explicit CUDAFederatedComm(Context const* ctx, std::shared_ptr<FederatedComm const> impl);
   [[nodiscard]] auto Stream() const { return stream_; }
+  Comm* MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const override {
+    LOG(FATAL) << "[Internal Error]: Invalid request for CUDA variant.";
+    return nullptr;
+  }
 };
 }  // namespace xgboost::collective
@@ -10,12 +10,12 @@
 #include <memory>  // for unique_ptr
 #include <string>  // for string

-#include "../../src/collective/comm.h"  // for Comm
+#include "../../src/collective/comm.h"  // for HostComm
 #include "../../src/common/json_utils.h"  // for OptionalArg
 #include "xgboost/json.h"

 namespace xgboost::collective {
-class FederatedComm : public Comm {
+class FederatedComm : public HostComm {
   std::shared_ptr<federated::Federated::Stub> stub_;

   void Init(std::string const& host, std::int32_t port, std::int32_t world, std::int32_t rank,
@@ -64,6 +64,6 @@ class FederatedComm : public Comm {
   [[nodiscard]] bool IsFederated() const override { return true; }
   [[nodiscard]] federated::Federated::Stub* Handle() const { return stub_.get(); }

-  Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
+  [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
 };
 }  // namespace xgboost::collective
@@ -15,6 +15,8 @@ class BuildConfiguration:  # pylint: disable=R0902
     use_cuda: bool = False
     # Whether to enable NCCL
     use_nccl: bool = False
+    # Whether to load nccl dynamically
+    use_dlopen_nccl: bool = False
     # Whether to enable HDFS
     use_hdfs: bool = False
     # Whether to enable Azure Storage
@@ -29,7 +29,8 @@ classifiers = [
 ]
 dependencies = [
     "numpy",
-    "scipy"
+    "scipy",
+    "nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'"
 ]

 [project.urls]
@@ -2,14 +2,15 @@
 import ctypes
 import json
 import logging
+import os
 import pickle
 from enum import IntEnum, unique
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 import numpy as np

 from ._typing import _T
-from .core import _LIB, _check_call, c_str, from_pystr_to_cstr, py_str
+from .core import _LIB, _check_call, build_info, c_str, from_pystr_to_cstr, py_str

 LOGGER = logging.getLogger("[xgboost.collective]")

@@ -250,6 +251,31 @@ class CommunicatorContext:

     def __init__(self, **args: Any) -> None:
         self.args = args
+        key = "dmlc_nccl_path"
+        if args.get(key, None) is not None:
+            return
+
+        binfo = build_info()
+        if not binfo["USE_DLOPEN_NCCL"]:
+            return
+
+        try:
+            # PyPI package of NCCL.
+            from nvidia.nccl import lib
+
+            # There are two versions of nvidia-nccl, one is from PyPI, another one from
+            # nvidia-pyindex. We support only the first one as the second one is too old
+            # (2.9.8 as of writing).
+            if lib.__file__ is not None:
+                dirname: Optional[str] = os.path.dirname(lib.__file__)
+            else:
+                dirname = None
+
+            if dirname:
+                path = os.path.join(dirname, "libnccl.so.2")
+                self.args[key] = path
+        except ImportError:
+            pass

     def __enter__(self) -> Dict[str, Any]:
         init(**self.args)
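For debugging, the lookup added to `CommunicatorContext` above can be reproduced by hand. The snippet below mirrors that logic and prints the path that would be passed as `dmlc_nccl_path`; it assumes the `nvidia-nccl-cu12` wheel is installed.

import os

# Mirrors the auto-discovery in CommunicatorContext: the PyPI wheel places
# libnccl.so.2 next to the nvidia.nccl.lib module.
try:
    from nvidia.nccl import lib
except ImportError:
    print("nvidia-nccl wheel is not installed")
else:
    if lib.__file__ is not None:
        print(os.path.join(os.path.dirname(lib.__file__), "libnccl.so.2"))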
@@ -184,6 +184,13 @@ def _py_version() -> str:
         return f.read().strip()


+def _register_log_callback(lib: ctypes.CDLL) -> None:
+    lib.XGBGetLastError.restype = ctypes.c_char_p
+    lib.callback = _get_log_callback_func()  # type: ignore
+    if lib.XGBRegisterLogCallback(lib.callback) != 0:
+        raise XGBoostError(lib.XGBGetLastError())
+
+
 def _load_lib() -> ctypes.CDLL:
     """Load xgboost Library."""
     lib_paths = find_lib_path()
@@ -228,10 +235,7 @@ Likely causes:
 Error message(s): {os_error_list}
 """
         )
-    lib.XGBGetLastError.restype = ctypes.c_char_p
-    lib.callback = _get_log_callback_func()  # type: ignore
-    if lib.XGBRegisterLogCallback(lib.callback) != 0:
-        raise XGBoostError(lib.XGBGetLastError())
+    _register_log_callback(lib)

     def parse(ver: str) -> Tuple[int, int, int]:
         """Avoid dependency on packaging (PEP 440)."""
@@ -7,8 +7,6 @@
 #include <cinttypes>  // for strtoimax
 #include <cmath>  // for nan
 #include <cstring>  // for strcmp
-#include <fstream>  // for operator<<, basic_ostream, ios, stringstream
-#include <functional>  // for less
 #include <limits>  // for numeric_limits
 #include <map>  // for operator!=, _Rb_tree_const_iterator, _Rb_tre...
 #include <memory>  // for shared_ptr, allocator, __shared_ptr_access
@@ -22,7 +20,6 @@
 #include "../common/charconv.h"  // for from_chars, to_chars, NumericLimits, from_ch...
 #include "../common/hist_util.h"  // for HistogramCuts
 #include "../common/io.h"  // for FileExtension, LoadSequentialFile, MemoryBuf...
-#include "../common/linalg_op.h"  // for ElementWiseTransformHost
 #include "../common/threading_utils.h"  // for OmpGetNumThreads, ParallelFor
 #include "../data/adapter.h"  // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
 #include "../data/ellpack_page.h"  // for EllpackPage
@@ -35,14 +32,12 @@
 #include "dmlc/parameter.h"  // for FieldAccessEntry, FieldEntry, ParamManager
 #include "dmlc/thread_local.h"  // for ThreadLocalStore
 #include "rabit/c_api.h"  // for RabitLinkTag
-#include "rabit/rabit.h"  // for CheckPoint, LoadCheckPoint
 #include "xgboost/base.h"  // for bst_ulong, bst_float, GradientPair, bst_feat...
 #include "xgboost/context.h"  // for Context
 #include "xgboost/data.h"  // for DMatrix, MetaInfo, DataType, ExtSparsePage
 #include "xgboost/feature_map.h"  // for FeatureMap
 #include "xgboost/global_config.h"  // for GlobalConfiguration, GlobalConfigThreadLocal...
 #include "xgboost/host_device_vector.h"  // for HostDeviceVector
-#include "xgboost/intrusive_ptr.h"  // for xgboost
 #include "xgboost/json.h"  // for Json, get, Integer, IsA, Boolean, String
 #include "xgboost/learner.h"  // for Learner, PredictionType
 #include "xgboost/logging.h"  // for LOG_FATAL, LogMessageFatal, CHECK, LogCheck_EQ
@@ -79,6 +74,7 @@ void XGBBuildInfoDevice(Json *p_info) {
   info["USE_CUDA"] = Boolean{false};
   info["USE_NCCL"] = Boolean{false};
   info["USE_RMM"] = Boolean{false};
+  info["USE_DLOPEN_NCCL"] = Boolean{false};
 }
 }  // namespace xgboost
 #endif
@@ -33,8 +33,16 @@ void XGBBuildInfoDevice(Json *p_info) {
   info["USE_NCCL"] = Boolean{true};
   v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
   info["NCCL_VERSION"] = v;
+
+#if defined(XGBOOST_USE_DLOPEN_NCCL)
+  info["USE_DLOPEN_NCCL"] = Boolean{true};
+#else
+  info["USE_DLOPEN_NCCL"] = Boolean{false};
+#endif  // defined(XGBOOST_USE_DLOPEN_NCCL)
+
 #else
   info["USE_NCCL"] = Boolean{false};
+  info["USE_DLOPEN_NCCL"] = Boolean{false};
 #endif

 #if defined(XGBOOST_USE_RMM)
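Since the flag is now reported in the build info, a quick way to tell whether a given XGBoost installation expects runtime NCCL loading is to inspect `build_info()` from Python. A small sketch:

import xgboost

# USE_DLOPEN_NCCL is reported alongside USE_CUDA/USE_NCCL after this change;
# a False value (or a missing key on older releases) means NCCL, if enabled,
# is linked at build time and no runtime lookup happens.
info = xgboost.build_info()
print(info.get("USE_CUDA"), info.get("USE_NCCL"), info.get("USE_DLOPEN_NCCL"))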
@@ -19,25 +19,6 @@ Coll* Coll::MakeCUDAVar() { return new NCCLColl{}; }

 NCCLColl::~NCCLColl() = default;
 namespace {
-Result GetNCCLResult(ncclResult_t code) {
-  if (code == ncclSuccess) {
-    return Success();
-  }
-
-  std::stringstream ss;
-  ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
-  if (code == ncclUnhandledCudaError) {
-    // nccl usually preserves the last error so we can get more details.
-    auto err = cudaPeekAtLastError();
-    ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
-  } else if (code == ncclSystemError) {
-    ss << " This might be caused by a network configuration issue. Please consider specifying "
-          "the network interface for NCCL via environment variables listed in its reference: "
-          "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
-  }
-  return Fail(ss.str());
-}
-
 auto GetNCCLType(ArrayInterfaceHandler::Type type) {
   auto fatal = [] {
     LOG(FATAL) << "Invalid type for NCCL operation.";
@@ -94,11 +75,12 @@ void RunBitwiseAllreduce(dh::CUDAStreamView stream, common::Span<std::int8_t> ou
                          common::Span<std::int8_t> data, Op op) {
   dh::device_vector<std::int8_t> buffer(data.size() * pcomm->World());
   auto* device_buffer = buffer.data().get();
+  auto stub = pcomm->Stub();

   // First gather data from all the workers.
   CHECK(handle);
-  auto rc = GetNCCLResult(
-      ncclAllGather(data.data(), device_buffer, data.size(), ncclInt8, handle, pcomm->Stream()));
+  auto rc = GetNCCLResult(stub, stub->Allgather(data.data(), device_buffer, data.size(), ncclInt8,
+                                                handle, pcomm->Stream()));
   if (!rc.OK()) {
     return rc;
   }
@@ -149,6 +131,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
   }
   auto nccl = dynamic_cast<NCCLComm const*>(&comm);
   CHECK(nccl);
+  auto stub = nccl->Stub();
+
   return Success() << [&] {
     if (IsBitwiseOp(op)) {
       return BitwiseAllReduce(nccl, nccl->Handle(), data, op);
@@ -156,9 +140,9 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
       return DispatchDType(type, [=](auto t) {
         using T = decltype(t);
         auto rdata = common::RestoreType<T>(data);
-        auto rc = ncclAllReduce(data.data(), data.data(), rdata.size(), GetNCCLType(type),
-                                GetNCCLRedOp(op), nccl->Handle(), nccl->Stream());
-        return GetNCCLResult(rc);
+        auto rc = stub->Allreduce(data.data(), data.data(), rdata.size(), GetNCCLType(type),
+                                  GetNCCLRedOp(op), nccl->Handle(), nccl->Stream());
+        return GetNCCLResult(stub, rc);
       });
     }
   } << [&] { return nccl->Block(); };
@@ -171,9 +155,11 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
   }
   auto nccl = dynamic_cast<NCCLComm const*>(&comm);
   CHECK(nccl);
+  auto stub = nccl->Stub();
+
   return Success() << [&] {
-    return GetNCCLResult(ncclBroadcast(data.data(), data.data(), data.size_bytes(), ncclInt8, root,
-                                       nccl->Handle(), nccl->Stream()));
+    return GetNCCLResult(stub, stub->Broadcast(data.data(), data.data(), data.size_bytes(),
+                                               ncclInt8, root, nccl->Handle(), nccl->Stream()));
   } << [&] { return nccl->Block(); };
 }

@@ -184,10 +170,12 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
   }
   auto nccl = dynamic_cast<NCCLComm const*>(&comm);
   CHECK(nccl);
+  auto stub = nccl->Stub();
+
   auto send = data.subspan(comm.Rank() * size, size);
   return Success() << [&] {
-    return GetNCCLResult(
-        ncclAllGather(send.data(), data.data(), size, ncclInt8, nccl->Handle(), nccl->Stream()));
+    return GetNCCLResult(stub, stub->Allgather(send.data(), data.data(), size, ncclInt8,
+                                               nccl->Handle(), nccl->Stream()));
   } << [&] { return nccl->Block(); };
 }

@@ -199,19 +187,20 @@ namespace cuda_impl {
 */
 Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const> data,
                            common::Span<std::int64_t const> sizes, common::Span<std::int8_t> recv) {
-  return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] {
+  auto stub = comm->Stub();
+  return Success() << [&stub] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] {
     std::size_t offset = 0;
     for (std::int32_t r = 0; r < comm->World(); ++r) {
       auto as_bytes = sizes[r];
-      auto rc = ncclBroadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes,
-                              ncclInt8, r, comm->Handle(), dh::DefaultStream());
+      auto rc = stub->Broadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes,
+                                ncclInt8, r, comm->Handle(), dh::DefaultStream());
       if (rc != ncclSuccess) {
-        return GetNCCLResult(rc);
+        return GetNCCLResult(stub, rc);
       }
       offset += as_bytes;
     }
     return Success();
-  } << [] { return GetNCCLResult(ncclGroupEnd()); };
+  } << [&] { return GetNCCLResult(stub, stub->GroupEnd()); };
 }
 }  // namespace cuda_impl

@@ -224,10 +213,11 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const>
   if (!comm.IsDistributed()) {
     return Success();
   }
+  auto stub = nccl->Stub();
+
   switch (algo) {
     case AllgatherVAlgo::kRing: {
-      return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] {
+      return Success() << [&] { return GetNCCLResult(stub, stub->GroupStart()); } << [&] {
        // get worker offset
        detail::AllgatherVOffset(sizes, recv_segments);
        // copy data
@@ -237,8 +227,8 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const>
                                       cudaMemcpyDeviceToDevice, nccl->Stream()));
         }
         return detail::RingAllgatherV(comm, sizes, recv_segments, recv);
-      } << [] {
-        return GetNCCLResult(ncclGroupEnd());
+      } << [&] {
+        return GetNCCLResult(stub, stub->GroupEnd());
       } << [&] { return nccl->Block(); };
     }
     case AllgatherVAlgo::kBcast: {
@@ -8,7 +8,8 @@
 #include "../data/array_interface.h"  // for ArrayInterfaceHandler
 #include "coll.h"  // for Coll
 #include "comm.h"  // for Comm
-#include "xgboost/span.h"  // for Span
+#include "nccl_stub.h"
+#include "xgboost/span.h"  // for Span

 namespace xgboost::collective {
 class NCCLColl : public Coll {
@@ -7,15 +7,12 @@
 #include <chrono>  // for seconds
 #include <cstdlib>  // for exit
 #include <memory>  // for shared_ptr
-#include <mutex>  // for unique_lock
 #include <string>  // for string
 #include <utility>  // for move, forward

 #include "../common/common.h"  // for AssertGPUSupport
-#include "../common/json_utils.h"  // for OptionalArg
 #include "allgather.h"  // for RingAllgather
 #include "protocol.h"  // for kMagic
-#include "tracker.h"  // for GetHostAddress
 #include "xgboost/base.h"  // for XGBOOST_STRICT_R_MODE
 #include "xgboost/collective/socket.h"  // for TCPSocket
 #include "xgboost/json.h"  // for Json, Object
@@ -62,14 +59,6 @@ Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, st
                          this->Rank(), this->World());
 }

-#if !defined(XGBOOST_USE_NCCL)
-Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
-  common::AssertGPUSupport();
-  common::AssertNCCLSupport();
-  return nullptr;
-}
-#endif  // !defined(XGBOOST_USE_NCCL)
-
 [[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport,
                                     proto::PeerInfo ninfo, std::chrono::seconds timeout,
                                     std::int32_t retry,
@@ -194,12 +183,21 @@ Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
 }

 RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
-                     std::int32_t retry, std::string task_id)
-    : Comm{std::move(host), port, timeout, retry, std::move(task_id)} {
+                     std::int32_t retry, std::string task_id, StringView nccl_path)
+    : HostComm{std::move(host), port, timeout, retry, std::move(task_id)},
+      nccl_path_{std::move(nccl_path)} {
   auto rc = this->Bootstrap(timeout_, retry_, task_id_);
   CHECK(rc.OK()) << rc.Report();
 }

+#if !defined(XGBOOST_USE_NCCL)
+Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
+  common::AssertGPUSupport();
+  common::AssertNCCLSupport();
+  return nullptr;
+}
+#endif  // !defined(XGBOOST_USE_NCCL)
+
 [[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
                                           std::string task_id) {
   TCPSocket tracker;
@@ -13,19 +13,21 @@
 #include "../common/cuda_context.cuh"  // for CUDAContext
 #include "../common/device_helpers.cuh"  // for DefaultStream
 #include "../common/type.h"  // for EraseType
-#include "broadcast.h"  // for Broadcast
 #include "comm.cuh"  // for NCCLComm
 #include "comm.h"  // for Comm
+#include "nccl_stub.h"  // for NcclStub
 #include "xgboost/collective/result.h"  // for Result
 #include "xgboost/span.h"  // for Span

 namespace xgboost::collective {
 namespace {
-Result GetUniqueId(Comm const& comm, std::shared_ptr<Coll> coll, ncclUniqueId* pid) {
+Result GetUniqueId(Comm const& comm, std::shared_ptr<NcclStub> stub, std::shared_ptr<Coll> coll,
+                   ncclUniqueId* pid) {
   static const int kRootRank = 0;
   ncclUniqueId id;
   if (comm.Rank() == kRootRank) {
-    dh::safe_nccl(ncclGetUniqueId(&id));
+    auto rc = GetNCCLResult(stub, stub->GetUniqueId(&id));
+    CHECK(rc.OK()) << rc.Report();
   }
   auto rc = coll->Broadcast(
       comm, common::Span{reinterpret_cast<std::int8_t*>(&id), sizeof(ncclUniqueId)}, kRootRank);
@@ -54,11 +56,12 @@ static std::string PrintUUID(xgboost::common::Span<std::uint64_t, kUuidLength> c
 }
 }  // namespace

-Comm* Comm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const {
-  return new NCCLComm{ctx, *this, pimpl};
+Comm* RabitComm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const {
+  return new NCCLComm{ctx, *this, pimpl, StringView{this->nccl_path_}};
 }

-NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl)
+NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl,
+                   StringView nccl_path)
     : Comm{root.TrackerInfo().host, root.TrackerInfo().port, root.Timeout(), root.Retry(),
            root.TaskID()},
       stream_{ctx->CUDACtx()->Stream()} {
@@ -70,6 +73,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
   }

   dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
+  stub_ = std::make_shared<NcclStub>(nccl_path);

   std::vector<std::uint64_t> uuids(root.World() * kUuidLength, 0);
   auto s_uuid = xgboost::common::Span<std::uint64_t>{uuids.data(), uuids.size()};
@@ -95,19 +99,24 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
       << "Multiple processes within communication group running on same CUDA "
       << "device is not supported. " << PrintUUID(s_this_uuid) << "\n";

-  rc = GetUniqueId(root, pimpl, &nccl_unique_id_);
+  rc = std::move(rc) << [&] {
+    return GetUniqueId(root, this->stub_, pimpl, &nccl_unique_id_);
+  } << [&] {
+    return GetNCCLResult(this->stub_, this->stub_->CommInitRank(&nccl_comm_, root.World(),
+                                                                nccl_unique_id_, root.Rank()));
+  };
   CHECK(rc.OK()) << rc.Report();
-  dh::safe_nccl(ncclCommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank()));

   for (std::int32_t r = 0; r < root.World(); ++r) {
     this->channels_.emplace_back(
-        std::make_shared<NCCLChannel>(root, r, nccl_comm_, dh::DefaultStream()));
+        std::make_shared<NCCLChannel>(root, r, nccl_comm_, stub_, dh::DefaultStream()));
   }
 }

 NCCLComm::~NCCLComm() {
   if (nccl_comm_) {
-    dh::safe_nccl(ncclCommDestroy(nccl_comm_));
+    auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_));
+    CHECK(rc.OK()) << rc.Report();
   }
 }
 }  // namespace xgboost::collective
@@ -6,9 +6,13 @@
 #ifdef XGBOOST_USE_NCCL
 #include "nccl.h"
 #endif  // XGBOOST_USE_NCCL

+#include <utility>  // for move
+
 #include "../common/device_helpers.cuh"
 #include "coll.h"
 #include "comm.h"
+#include "nccl_stub.h"  // for NcclStub
 #include "xgboost/context.h"

 namespace xgboost::collective {
@@ -21,15 +25,20 @@ inline Result GetCUDAResult(cudaError rc) {
   return Fail(msg);
 }

+#if defined(XGBOOST_USE_NCCL)
 class NCCLComm : public Comm {
   ncclComm_t nccl_comm_{nullptr};
+  std::shared_ptr<NcclStub> stub_;
   ncclUniqueId nccl_unique_id_{};
   dh::CUDAStreamView stream_;
+  std::string nccl_path_;

  public:
   [[nodiscard]] ncclComm_t Handle() const { return nccl_comm_; }
+  auto Stub() const { return stub_; }

-  explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl);
+  explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl,
+                    StringView nccl_path);
   [[nodiscard]] Result LogTracker(std::string) const override {
     LOG(FATAL) << "Device comm is used for logging.";
     return Fail("Undefined.");
@@ -43,25 +52,53 @@ class NCCLComm : public Comm {
   }
 };

+inline Result GetNCCLResult(std::shared_ptr<NcclStub> stub, ncclResult_t code) {
+  if (code == ncclSuccess) {
+    return Success();
+  }
+
+  std::stringstream ss;
+  ss << "NCCL failure: " << stub->GetErrorString(code) << ".";
+  if (code == ncclUnhandledCudaError) {
+    // nccl usually preserves the last error so we can get more details.
+    auto err = cudaPeekAtLastError();
+    ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
+  } else if (code == ncclSystemError) {
+    ss << " This might be caused by a network configuration issue. Please consider specifying "
+          "the network interface for NCCL via environment variables listed in its reference: "
+          "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
+  }
+  return Fail(ss.str());
+}
+
 class NCCLChannel : public Channel {
   std::int32_t rank_{-1};
   ncclComm_t nccl_comm_{};
+  std::shared_ptr<NcclStub> stub_;
   dh::CUDAStreamView stream_;

  public:
   explicit NCCLChannel(Comm const& comm, std::int32_t rank, ncclComm_t nccl_comm,
-                       dh::CUDAStreamView stream)
-      : rank_{rank}, nccl_comm_{nccl_comm}, Channel{comm, nullptr}, stream_{stream} {}
+                       std::shared_ptr<NcclStub> stub, dh::CUDAStreamView stream)
+      : rank_{rank},
+        nccl_comm_{nccl_comm},
+        stub_{std::move(stub)},
+        Channel{comm, nullptr},
+        stream_{stream} {}

   void SendAll(std::int8_t const* ptr, std::size_t n) override {
-    dh::safe_nccl(ncclSend(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    auto rc = GetNCCLResult(stub_, stub_->Send(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    CHECK(rc.OK()) << rc.Report();
   }
   void RecvAll(std::int8_t* ptr, std::size_t n) override {
-    dh::safe_nccl(ncclRecv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    auto rc = GetNCCLResult(stub_, stub_->Recv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
+    CHECK(rc.OK()) << rc.Report();
   }
   [[nodiscard]] Result Block() override {
     auto rc = stream_.Sync(false);
     return GetCUDAResult(rc);
   }
 };

+#endif  // defined(XGBOOST_USE_NCCL)
 }  // namespace xgboost::collective
@@ -34,6 +34,8 @@ inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) {
   return nrank;
 }

+inline StringView DefaultNcclName() { return "libnccl.so.2"; }
+
 class Channel;
 class Coll;

@@ -86,11 +88,21 @@ class Comm : public std::enable_shared_from_this<Comm> {
   [[nodiscard]] virtual Result LogTracker(std::string msg) const = 0;

   [[nodiscard]] virtual Result SignalError(Result const&) { return Success(); }
-
-  virtual Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const;
 };

-class RabitComm : public Comm {
+/**
+ * @brief Base class for CPU-based communicator.
+ */
+class HostComm : public Comm {
+ public:
+  using Comm::Comm;
+  [[nodiscard]] virtual Comm* MakeCUDAVar(Context const* ctx,
+                                          std::shared_ptr<Coll> pimpl) const = 0;
+};
+
+class RabitComm : public HostComm {
+  std::string nccl_path_ = std::string{DefaultNcclName()};
+
   [[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
                                  std::string task_id);
   [[nodiscard]] Result Shutdown();
@@ -100,13 +112,15 @@ class RabitComm : public Comm {
   RabitComm() = default;
   // ctor for testing where environment is known.
   RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
-            std::int32_t retry, std::string task_id);
+            std::int32_t retry, std::string task_id, StringView nccl_path);
   ~RabitComm() noexcept(false) override;

   [[nodiscard]] bool IsFederated() const override { return false; }
   [[nodiscard]] Result LogTracker(std::string msg) const override;

   [[nodiscard]] Result SignalError(Result const&) override;

+  [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
 };

 /**
@@ -37,7 +37,7 @@ namespace xgboost::collective {
 [[nodiscard]] Comm const& CommGroup::Ctx(Context const* ctx, DeviceOrd device) const {
   if (device.IsCUDA()) {
     CHECK(ctx->IsCUDA());
-    if (!gpu_comm_) {
+    if (!gpu_comm_ || gpu_comm_->World() != comm_->World()) {
       gpu_comm_.reset(comm_->MakeCUDAVar(ctx, backend_));
     }
     return *gpu_comm_;
@@ -55,7 +55,6 @@ CommGroup::CommGroup()
   }

   std::string type = OptionalArg<String>(config, "dmlc_communicator", std::string{"rabit"});
-  std::vector<std::string> keys;
   // Try both lower and upper case for compatibility
   auto get_param = [&](std::string name, auto dft, auto t) {
     std::string upper;
@@ -63,8 +62,6 @@ CommGroup::CommGroup()
                    [](char c) { return std::toupper(c); });
     std::transform(name.cbegin(), name.cend(), name.begin(),
                    [](char c) { return std::tolower(c); });
-    keys.push_back(upper);
-    keys.push_back(name);

     auto const& obj = get<Object const>(config);
     auto it = obj.find(upper);
@@ -75,19 +72,19 @@ CommGroup::CommGroup()
     }
   };
   // Common args
-  auto retry =
-      OptionalArg<Integer>(config, "dmlc_retry", static_cast<Integer::Int>(DefaultRetry()));
-  auto timeout = OptionalArg<Integer>(config, "dmlc_timeout_sec",
-                                      static_cast<Integer::Int>(DefaultTimeoutSec()));
+  auto retry = get_param("dmlc_retry", static_cast<Integer::Int>(DefaultRetry()), Integer{});
+  auto timeout =
+      get_param("dmlc_timeout_sec", static_cast<Integer::Int>(DefaultTimeoutSec()), Integer{});
   auto task_id = get_param("dmlc_task_id", std::string{}, String{});

   if (type == "rabit") {
     auto host = get_param("dmlc_tracker_uri", std::string{}, String{});
     auto port = get_param("dmlc_tracker_port", static_cast<std::int64_t>(0), Integer{});
+    auto nccl = get_param("dmlc_nccl_path", std::string{DefaultNcclName()}, String{});
     auto ptr =
         new CommGroup{std::shared_ptr<RabitComm>{new RabitComm{  // NOLINT
                           host, static_cast<std::int32_t>(port), std::chrono::seconds{timeout},
-                          static_cast<std::int32_t>(retry), task_id}},
+                          static_cast<std::int32_t>(retry), task_id, nccl}},
                       std::shared_ptr<Coll>(new Coll{})};  // NOLINT
     return ptr;
   } else if (type == "federated") {
|
|||||||
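With this change the rabit communicator accepts an extra `dmlc_nccl_path` argument, defaulting to `DefaultNcclName()`. A minimal sketch of supplying it from the Python side, assuming a rabit tracker is already listening at the placeholder address below; only `dmlc_nccl_path` is new here, the other arguments already existed:

# Sketch only: the tracker host/port and the NCCL path are placeholders.
from xgboost.collective import CommunicatorContext

args = {
    "dmlc_communicator": "rabit",
    "dmlc_tracker_uri": "127.0.0.1",  # placeholder tracker host
    "dmlc_tracker_port": 9091,        # placeholder tracker port
    "dmlc_task_id": "t:0",
    # New in this PR: point the worker at a specific libnccl shared object.
    "dmlc_nccl_path": "/path/to/libnccl.so.2",
}

with CommunicatorContext(**args):
    # Distributed GPU training would run here; NCCL is loaded lazily when the
    # first GPU collective call is made.
    pass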
@@ -17,14 +17,16 @@ namespace xgboost::collective {
  * collective implementations.
  */
 class CommGroup {
-  std::shared_ptr<Comm> comm_;
+  std::shared_ptr<HostComm> comm_;
   mutable std::shared_ptr<Comm> gpu_comm_;

   std::shared_ptr<Coll> backend_;
   mutable std::shared_ptr<Coll> gpu_coll_;  // lazy initialization

   CommGroup(std::shared_ptr<Comm> comm, std::shared_ptr<Coll> coll)
-      : comm_{std::move(comm)}, backend_{std::move(coll)} {}
+      : comm_{std::dynamic_pointer_cast<HostComm>(comm)}, backend_{std::move(coll)} {
+    CHECK(comm_);
+  }

  public:
   CommGroup();
@@ -3,6 +3,7 @@
  */
 #include "communicator.h"

+#include "comm.h"
 #include "in_memory_communicator.h"
 #include "noop_communicator.h"
 #include "rabit_communicator.h"
@@ -14,8 +15,12 @@
 namespace xgboost::collective {
 thread_local std::unique_ptr<Communicator> Communicator::communicator_{new NoOpCommunicator()};
 thread_local CommunicatorType Communicator::type_{};
+thread_local std::string Communicator::nccl_path_{};

 void Communicator::Init(Json const& config) {
+  auto nccl = OptionalArg<String>(config, "dmlc_nccl_path", std::string{DefaultNcclName()});
+  nccl_path_ = nccl;
+
   auto type = GetTypeFromEnv();
   auto const arg = GetTypeFromConfig(config);
   if (arg != CommunicatorType::kUnknown) {
@@ -31,17 +31,17 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
 #ifdef XGBOOST_USE_NCCL
     switch (type_) {
       case CommunicatorType::kRabit:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_));
         break;
       case CommunicatorType::kFederated:
       case CommunicatorType::kInMemory:
        device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
         break;
       case CommunicatorType::kInMemoryNccl:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true));
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true, nccl_path_));
         break;
       default:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_));
     }
 #else
     device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
@@ -234,6 +234,7 @@ class Communicator {

   static thread_local std::unique_ptr<Communicator> communicator_;
   static thread_local CommunicatorType type_;
+  static thread_local std::string nccl_path_;
 #if defined(XGBOOST_USE_CUDA)
   static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
 #endif
@@ -2,12 +2,14 @@
  * Copyright 2023 XGBoost contributors
  */
 #if defined(XGBOOST_USE_NCCL)
+#include "comm.cuh"
 #include "nccl_device_communicator.cuh"

 namespace xgboost {
 namespace collective {

-NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync)
+NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync,
+                                               StringView nccl_path)
     : device_ordinal_{device_ordinal},
       needs_sync_{needs_sync},
       world_size_{GetWorldSize()},
@@ -18,6 +20,7 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy
   if (world_size_ == 1) {
     return;
   }
+  stub_ = std::make_shared<NcclStub>(std::move(nccl_path));

   std::vector<uint64_t> uuids(world_size_ * kUuidLength, 0);
   auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
@@ -43,7 +46,9 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy

   nccl_unique_id_ = GetUniqueId();
   dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
+  auto rc =
+      GetNCCLResult(stub_, stub_->CommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
+  CHECK(rc.OK()) << rc.Report();
 }

 NcclDeviceCommunicator::~NcclDeviceCommunicator() {
@@ -51,7 +56,8 @@ NcclDeviceCommunicator::~NcclDeviceCommunicator() {
     return;
   }
   if (nccl_comm_) {
-    dh::safe_nccl(ncclCommDestroy(nccl_comm_));
+    auto rc = GetNCCLResult(stub_, stub_->CommDestroy(nccl_comm_));
+    CHECK(rc.OK()) << rc.Report();
   }
   if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
     LOG(CONSOLE) << "======== NCCL Statistics========";
@@ -137,8 +143,10 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si
   auto *device_buffer = buffer.data().get();

   // First gather data from all the workers.
-  dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
-                              nccl_comm_, dh::DefaultStream()));
+  auto rc = GetNCCLResult(
+      stub_, stub_->Allgather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
+                              nccl_comm_, dh::DefaultStream()));
+  CHECK(rc.OK()) << rc.Report();
   if (needs_sync_) {
     dh::DefaultStream().Sync();
   }
@@ -170,9 +178,10 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co
   if (IsBitwiseOp(op)) {
     BitwiseAllReduce(send_receive_buffer, count, data_type, op);
   } else {
-    dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
-                                GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
-                                dh::DefaultStream()));
+    auto rc = GetNCCLResult(stub_, stub_->Allreduce(send_receive_buffer, send_receive_buffer, count,
+                                                    GetNcclDataType(data_type), GetNcclRedOp(op),
+                                                    nccl_comm_, dh::DefaultStream()));
+    CHECK(rc.OK()) << rc.Report();
   }
   allreduce_bytes_ += count * GetTypeSize(data_type);
   allreduce_calls_ += 1;
@@ -185,8 +194,9 @@ void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_bu
   }

   dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  dh::safe_nccl(ncclAllGather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_,
-                              dh::DefaultStream()));
+  auto rc = GetNCCLResult(stub_, stub_->Allgather(send_buffer, receive_buffer, send_size, ncclInt8,
+                                                  nccl_comm_, dh::DefaultStream()));
+  CHECK(rc.OK()) << rc.Report();
 }

 void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
@@ -206,14 +216,19 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b
   receive_buffer->resize(total_bytes);

   size_t offset = 0;
-  dh::safe_nccl(ncclGroupStart());
-  for (int32_t i = 0; i < world_size_; ++i) {
-    size_t as_bytes = segments->at(i);
-    dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
-                                ncclChar, i, nccl_comm_, dh::DefaultStream()));
-    offset += as_bytes;
-  }
-  dh::safe_nccl(ncclGroupEnd());
+  auto rc = Success() << [&] { return GetNCCLResult(stub_, stub_->GroupStart()); } << [&] {
+    for (int32_t i = 0; i < world_size_; ++i) {
+      size_t as_bytes = segments->at(i);
+      auto rc = GetNCCLResult(
+          stub_, stub_->Broadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
+                                  ncclChar, i, nccl_comm_, dh::DefaultStream()));
+      if (!rc.OK()) {
+        return rc;
+      }
+      offset += as_bytes;
+    }
+    return Success();
+  } << [&] { return GetNCCLResult(stub_, stub_->GroupEnd()); };
 }

 void NcclDeviceCommunicator::Synchronize() {
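The rewritten AllGatherV above chains GroupStart, the per-rank Broadcast loop, and GroupEnd through Result values instead of throwing from `dh::safe_nccl`. A standalone Python illustration of that short-circuiting chain (not an XGBoost API, just the control flow):

# Illustration of the Result-chaining pattern: each step returns an error
# message (or None for success) and later steps are skipped once one fails,
# mirroring `Success() << [&]{ ... } << [&]{ ... }` in the diff above.
from typing import Callable, Optional

def chain(*steps: Callable[[], Optional[str]]) -> Optional[str]:
    for step in steps:
        err = step()
        if err is not None:
            return err  # short-circuit on the first failure
    return None

# Example: the middle step fails, so the last step never runs.
error = chain(
    lambda: None,                # "GroupStart" succeeded
    lambda: "broadcast failed",  # one of the broadcasts failed
    lambda: None,                # "GroupEnd" is skipped
)
print(error)  # -> broadcast failed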
@@ -4,8 +4,10 @@
 #pragma once

 #include "../common/device_helpers.cuh"
+#include "comm.cuh"
 #include "communicator.h"
 #include "device_communicator.cuh"
+#include "nccl_stub.h"

 namespace xgboost {
 namespace collective {
@@ -25,7 +27,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
    * needed. The in-memory communicator is used in tests with multiple threads, each thread
    * representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
    */
-  explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
+  explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync, StringView nccl_path);
   ~NcclDeviceCommunicator() override;
   void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
                  Operation op) override;
@@ -64,7 +66,8 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
     static const int kRootRank = 0;
     ncclUniqueId id;
     if (rank_ == kRootRank) {
-      dh::safe_nccl(ncclGetUniqueId(&id));
+      auto rc = GetNCCLResult(stub_, stub_->GetUniqueId(&id));
+      CHECK(rc.OK()) << rc.Report();
     }
     Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
     return id;
@@ -78,6 +81,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
   int const world_size_;
   int const rank_;
   ncclComm_t nccl_comm_{};
+  std::shared_ptr<NcclStub> stub_;
   ncclUniqueId nccl_unique_id_{};
   size_t allreduce_bytes_{0};  // Keep statistics of the number of bytes communicated.
   size_t allreduce_calls_{0};  // Keep statistics of the number of reduce calls.
src/collective/nccl_stub.cc (new file, 109 lines)
@@ -0,0 +1,109 @@
/**
 * Copyright 2023, XGBoost Contributors
 */
#if defined(XGBOOST_USE_NCCL)
#include "nccl_stub.h"

#include <cuda.h>   // for CUDA_VERSION
#include <dlfcn.h>  // for dlclose, dlsym, dlopen
#include <nccl.h>

#include <cstdint>  // for int32_t
#include <sstream>  // for stringstream
#include <string>   // for string
#include <utility>  // for move

#include "xgboost/logging.h"

namespace xgboost::collective {
NcclStub::NcclStub(StringView path) : path_{std::move(path)} {
#if defined(XGBOOST_USE_DLOPEN_NCCL)
  CHECK(!path_.empty()) << "Empty path for NCCL.";

  auto cu_major = (CUDA_VERSION) / 1000;
  std::stringstream ss;
  ss << R"m(

If XGBoost is installed from PyPI with pip, the error can be fixed by:

- Run `pip install nvidia-nccl-cu)m"
     << cu_major << "` (Or with any CUDA version that's compatible with " << cu_major << ").";
  ss << R"m(

Otherwise, please refer to:

  https://xgboost.readthedocs.io/en/stable/tutorials/dask.html#troubleshooting

for more info, or open an issue on GitHub. Starting from XGBoost 2.1.0, the PyPI package
no longer bundles NCCL in the binary wheel.

)m";
  auto help = ss.str();
  std::string msg{"Failed to load NCCL from path: `" + path_ + "`. Error:\n "};

  auto safe_load = [&](auto t, StringView name) {
    std::stringstream errs;
    auto ptr = reinterpret_cast<decltype(t)>(dlsym(handle_, name.c_str()));
    if (!ptr) {
      errs << "Failed to load NCCL symbol `" << name << "` from " << path_ << ". Error:\n "
           << dlerror() << help;
      LOG(FATAL) << errs.str();
    }
    return ptr;
  };

  handle_ = dlopen(path_.c_str(), RTLD_LAZY);
  if (!handle_) {
    LOG(FATAL) << msg << dlerror() << help;
  }

  allreduce_ = safe_load(allreduce_, "ncclAllReduce");
  broadcast_ = safe_load(broadcast_, "ncclBroadcast");
  allgather_ = safe_load(allgather_, "ncclAllGather");
  comm_init_rank_ = safe_load(comm_init_rank_, "ncclCommInitRank");
  comm_destroy_ = safe_load(comm_destroy_, "ncclCommDestroy");
  get_uniqueid_ = safe_load(get_uniqueid_, "ncclGetUniqueId");
  send_ = safe_load(send_, "ncclSend");
  recv_ = safe_load(recv_, "ncclRecv");
  group_start_ = safe_load(group_start_, "ncclGroupStart");
  group_end_ = safe_load(group_end_, "ncclGroupEnd");
  get_error_string_ = safe_load(get_error_string_, "ncclGetErrorString");
  get_version_ = safe_load(get_version_, "ncclGetVersion");

  std::int32_t v;
  CHECK_EQ(get_version_(&v), ncclSuccess);
  auto patch = v % 100;
  auto minor = (v / 100) % 100;
  auto major = v / 10000;

  LOG(INFO) << "Loaded shared NCCL " << major << "." << minor << "." << patch << ":`" << path_
            << "`" << std::endl;
#else
  allreduce_ = ncclAllReduce;
  broadcast_ = ncclBroadcast;
  allgather_ = ncclAllGather;
  comm_init_rank_ = ncclCommInitRank;
  comm_destroy_ = ncclCommDestroy;
  get_uniqueid_ = ncclGetUniqueId;
  send_ = ncclSend;
  recv_ = ncclRecv;
  group_start_ = ncclGroupStart;
  group_end_ = ncclGroupEnd;
  get_error_string_ = ncclGetErrorString;
  get_version_ = ncclGetVersion;
#endif
};

NcclStub::~NcclStub() {  // NOLINT
#if defined(XGBOOST_USE_DLOPEN_NCCL)
  if (handle_) {
    auto rc = dlclose(handle_);
    if (rc != 0) {
      LOG(WARNING) << "Failed to close NCCL handle:" << dlerror();
    }
  }
  handle_ = nullptr;
#endif  // defined(XGBOOST_USE_DLOPEN_NCCL)
}
}  // namespace xgboost::collective
#endif  // defined(XGBOOST_USE_NCCL)
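The stub above resolves every NCCL entry point with dlsym after dlopen-ing the library named by `dmlc_nccl_path`, then logs the version it found. For orientation, here is a hedged Python/ctypes sketch of the same start-up sequence; it is an illustration only, not part of the PR, and the default library name is an assumption:

# Illustration: open the NCCL shared object, resolve one symbol, read the version.
import ctypes

def load_nccl(path: str = "libnccl.so.2") -> None:
    lib = ctypes.CDLL(path)  # roughly what dlopen() does in NcclStub
    version = ctypes.c_int(0)
    # ncclGetVersion encodes major*10000 + minor*100 + patch, which is how the
    # stub decodes it for the "Loaded shared NCCL x.y.z" log message.
    if lib.ncclGetVersion(ctypes.byref(version)) != 0:
        raise RuntimeError("ncclGetVersion failed")
    v = version.value
    print(f"Loaded shared NCCL {v // 10000}.{v // 100 % 100}.{v % 100} from {path}")

if __name__ == "__main__":
    load_nccl()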
src/collective/nccl_stub.h (new file, 94 lines)
@@ -0,0 +1,94 @@
/**
 * Copyright 2023, XGBoost Contributors
 */
#pragma once
#if defined(XGBOOST_USE_NCCL)
#include <cuda_runtime_api.h>
#include <nccl.h>

#include <string>  // for string

#include "xgboost/string_view.h"  // for StringView

namespace xgboost::collective {
class NcclStub {
#if defined(XGBOOST_USE_DLOPEN_NCCL)
  void* handle_{nullptr};
#endif  // defined(XGBOOST_USE_DLOPEN_NCCL)
  std::string path_;

  decltype(ncclAllReduce)* allreduce_{nullptr};
  decltype(ncclBroadcast)* broadcast_{nullptr};
  decltype(ncclAllGather)* allgather_{nullptr};
  decltype(ncclCommInitRank)* comm_init_rank_{nullptr};
  decltype(ncclCommDestroy)* comm_destroy_{nullptr};
  decltype(ncclGetUniqueId)* get_uniqueid_{nullptr};
  decltype(ncclSend)* send_{nullptr};
  decltype(ncclRecv)* recv_{nullptr};
  decltype(ncclGroupStart)* group_start_{nullptr};
  decltype(ncclGroupEnd)* group_end_{nullptr};
  decltype(ncclGetErrorString)* get_error_string_{nullptr};
  decltype(ncclGetVersion)* get_version_{nullptr};

 public:
  explicit NcclStub(StringView path);
  ~NcclStub();

  [[nodiscard]] ncclResult_t Allreduce(const void* sendbuff, void* recvbuff, size_t count,
                                       ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
                                       cudaStream_t stream) const {
    CHECK(allreduce_);
    return this->allreduce_(sendbuff, recvbuff, count, datatype, op, comm, stream);
  }
  [[nodiscard]] ncclResult_t Broadcast(const void* sendbuff, void* recvbuff, size_t count,
                                       ncclDataType_t datatype, int root, ncclComm_t comm,
                                       cudaStream_t stream) const {
    CHECK(broadcast_);
    return this->broadcast_(sendbuff, recvbuff, count, datatype, root, comm, stream);
  }
  [[nodiscard]] ncclResult_t Allgather(const void* sendbuff, void* recvbuff, size_t sendcount,
                                       ncclDataType_t datatype, ncclComm_t comm,
                                       cudaStream_t stream) const {
    CHECK(allgather_);
    return this->allgather_(sendbuff, recvbuff, sendcount, datatype, comm, stream);
  }
  [[nodiscard]] ncclResult_t CommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
                                          int rank) const {
    CHECK(comm_init_rank_);
    return this->comm_init_rank_(comm, nranks, commId, rank);
  }
  [[nodiscard]] ncclResult_t CommDestroy(ncclComm_t comm) const {
    CHECK(comm_destroy_);
    return this->comm_destroy_(comm);
  }

  [[nodiscard]] ncclResult_t GetUniqueId(ncclUniqueId* uniqueId) const {
    CHECK(get_uniqueid_);
    return this->get_uniqueid_(uniqueId);
  }
  [[nodiscard]] ncclResult_t Send(const void* sendbuff, size_t count, ncclDataType_t datatype,
                                  int peer, ncclComm_t comm, cudaStream_t stream) {
    CHECK(send_);
    return send_(sendbuff, count, datatype, peer, comm, stream);
  }
  [[nodiscard]] ncclResult_t Recv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
                                  ncclComm_t comm, cudaStream_t stream) const {
    CHECK(recv_);
    return recv_(recvbuff, count, datatype, peer, comm, stream);
  }
  [[nodiscard]] ncclResult_t GroupStart() const {
    CHECK(group_start_);
    return group_start_();
  }
  [[nodiscard]] ncclResult_t GroupEnd() const {
    CHECK(group_end_);
    return group_end_();
  }

  [[nodiscard]] const char* GetErrorString(ncclResult_t result) const {
    return get_error_string_(result);
  }
};
}  // namespace xgboost::collective

#endif  // defined(XGBOOST_USE_NCCL)
@@ -115,30 +115,6 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) {  // NOLINT
 }
 namespace dh {

-#ifdef XGBOOST_USE_NCCL
-#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
-
-inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
-  if (code != ncclSuccess) {
-    std::stringstream ss;
-    ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
-    ss << " " << file << "(" << line << ")\n";
-    if (code == ncclUnhandledCudaError) {
-      // nccl usually preserves the last error so we can get more details.
-      auto err = cudaPeekAtLastError();
-      ss << "  CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
-    } else if (code == ncclSystemError) {
-      ss << "  This might be caused by a network configuration issue. Please consider specifying "
-            "the network interface for NCCL via environment variables listed in its reference: "
-            "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
-    }
-    LOG(FATAL) << ss.str();
-  }
-
-  return code;
-}
-#endif
-
 inline int32_t CudaGetPointerDevice(void const *ptr) {
   int32_t device = -1;
   cudaPointerAttributes attr;
@@ -21,11 +21,18 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg
 `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"

 echo "--- Build libxgboost from the source"
-$command_wrapper tests/ci_build/prune_libnccl.sh
-$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \
-  -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
-  -DPLUGIN_RMM=ON -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
-  -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
+$command_wrapper tests/ci_build/build_via_cmake.sh \
+  -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \
+  -DUSE_CUDA=ON \
+  -DUSE_OPENMP=ON \
+  -DHIDE_CXX_SYMBOLS=ON \
+  -DPLUGIN_FEDERATED=ON \
+  -DPLUGIN_RMM=ON \
+  -DUSE_NCCL=ON \
+  -DUSE_NCCL_LIB_PATH=ON \
+  -DNCCL_INCLUDE_DIR=/usr/include \
+  -DUSE_DLOPEN_NCCL=ON \
+  ${arch_flag}
 echo "--- Build binary wheel"
 $command_wrapper bash -c \
   "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"
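The CI script above now configures the wheel build with -DUSE_DLOPEN_NCCL=ON instead of pruning and statically linking libnccl_static.a. For a local build the same flags can be passed to CMake directly; a small Python driver is used below only to keep the examples in one language, and it assumes CMake plus the CUDA and NCCL development headers are installed:

# Sketch of a local configure/build with runtime-loaded NCCL, mirroring the
# flags used by the updated CI script. Paths are assumptions.
import subprocess

flags = [
    "-DUSE_CUDA=ON",
    "-DUSE_NCCL=ON",
    "-DUSE_DLOPEN_NCCL=ON",  # load libnccl with dlopen at runtime (Linux only)
]
subprocess.run(["cmake", "-S", ".", "-B", "build", *flags], check=True)
subprocess.run(["cmake", "--build", "build", "--parallel"], check=True)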
@@ -21,11 +21,17 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg
 `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"

 echo "--- Build libxgboost from the source"
-$command_wrapper tests/ci_build/prune_libnccl.sh
-$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc" \
-  -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
-  -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
-  -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
+$command_wrapper tests/ci_build/build_via_cmake.sh \
+  -DCMAKE_PREFIX_PATH="/opt/grpc" \
+  -DUSE_CUDA=ON \
+  -DUSE_OPENMP=ON \
+  -DHIDE_CXX_SYMBOLS=ON \
+  -DPLUGIN_FEDERATED=ON \
+  -DUSE_NCCL=ON \
+  -DUSE_NCCL_LIB_PATH=ON \
+  -DNCCL_INCLUDE_DIR=/usr/include \
+  -DUSE_DLOPEN_NCCL=ON \
+  ${arch_flag}
 echo "--- Build binary wheel"
 $command_wrapper bash -c \
   "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"
@@ -10,6 +10,7 @@ chmod +x build/testxgboost
 tests/ci_build/ci_build.sh gpu nvidia-docker \
   --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
   --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
   build/testxgboost

 echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
@@ -13,4 +13,5 @@ chmod +x build/testxgboost
 tests/ci_build/ci_build.sh gpu nvidia-docker \
   --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
   --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
   build/testxgboost --gtest_filter=*MGPU*
@@ -24,7 +24,8 @@ export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'

 command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
   `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
-  `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+  `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "`
+  `"NCCL_VERSION_ARG=$NCCL_VERSION"

 # Run specified test suite
 case "$suite" in
@@ -2,6 +2,7 @@ ARG CUDA_VERSION_ARG
 FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04
 ARG CUDA_VERSION_ARG
 ARG RAPIDS_VERSION_ARG
+ARG NCCL_VERSION_ARG

 # Environment
 ENV DEBIAN_FRONTEND noninteractive
@@ -23,7 +24,9 @@ RUN \
     conda install -c conda-forge mamba && \
     mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
         python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
-        dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
+        nccl>=$(cut -d "-" -f 1 <<< $NCCL_VERSION_ARG) \
+        dask \
+        dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
         numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
         pyspark>=3.4.0 cloudpickle cuda-python && \
     mamba clean --all && \
@@ -27,7 +27,7 @@ RUN \
     wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
     rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
     yum -y update && \
-    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
+    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
     rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm;

 ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH
@@ -1,35 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-rm -rf tmp_nccl
-
-mkdir tmp_nccl
-pushd tmp_nccl
-
-set -x
-
-cat << EOF > test.cu
-int main(void) { return 0; }
-EOF
-
-cat << EOF > CMakeLists.txt
-cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
-project(gencode_extractor CXX C)
-cmake_policy(SET CMP0104 NEW)
-set(CMAKE_CUDA_HOST_COMPILER \${CMAKE_CXX_COMPILER})
-enable_language(CUDA)
-include(../cmake/Utils.cmake)
-compute_cmake_cuda_archs("")
-add_library(test OBJECT test.cu)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-EOF
-
-cmake . -GNinja -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-gen_code=$(grep -o -- '--generate-code=\S*' compile_commands.json | paste -sd ' ')
-
-nvprune ${gen_code} /usr/lib64/libnccl_static.a -o ../libnccl_static.a
-
-popd
-rm -rf tmp_nccl
-
-set +x
@@ -1,22 +1,10 @@
 import os
 import sys
-from contextlib import contextmanager

-
-@contextmanager
-def cd(path):
-    path = os.path.normpath(path)
-    cwd = os.getcwd()
-    os.chdir(path)
-    print("cd " + path)
-    try:
-        yield path
-    finally:
-        os.chdir(cwd)
-
+from test_utils import DirectoryExcursion

 if len(sys.argv) != 4:
-    print('Usage: {} [wheel to rename] [commit id] [platform tag]'.format(sys.argv[0]))
+    print("Usage: {} [wheel to rename] [commit id] [platform tag]".format(sys.argv[0]))
     sys.exit(1)


@@ -26,20 +14,26 @@ platform_tag = sys.argv[3]

 dirname, basename = os.path.dirname(whl_path), os.path.basename(whl_path)

-with cd(dirname):
-    tokens = basename.split('-')
+with DirectoryExcursion(dirname):
+    tokens = basename.split("-")
     assert len(tokens) == 5
-    version = tokens[1].split('+')[0]
-    keywords = {'pkg_name': tokens[0],
-                'version': version,
-                'commit_id': commit_id,
-                'platform_tag': platform_tag}
-    new_name = '{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl'.format(**keywords)
-    print('Renaming {} to {}...'.format(basename, new_name))
+    version = tokens[1].split("+")[0]
+    keywords = {
+        "pkg_name": tokens[0],
+        "version": version,
+        "commit_id": commit_id,
+        "platform_tag": platform_tag,
+    }
+    new_name = "{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl".format(
+        **keywords
+    )
+    print("Renaming {} to {}...".format(basename, new_name))
     if os.path.isfile(new_name):
         os.remove(new_name)
     os.rename(basename, new_name)

     filesize = os.path.getsize(new_name) / 1024 / 1024  # MB
+    print(f"Wheel size: {filesize}")

     msg = f"Limit of wheel size set by PyPI is exceeded. {new_name}: {filesize}"
     assert filesize <= 300, msg
@@ -90,10 +90,10 @@ class Worker : public NCCLWorkerForTest {
   }
 };

-class AllgatherTestGPU : public SocketTest {};
+class MGPUAllgatherTest : public SocketTest {};
 }  // namespace

-TEST_F(AllgatherTestGPU, MGPUTestVRing) {
+TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
   auto n_workers = common::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {
@@ -104,7 +104,7 @@ TEST_F(AllgatherTestGPU, MGPUTestVRing) {
   });
 }

-TEST_F(AllgatherTestGPU, MGPUTestVBcast) {
+TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
   auto n_workers = common::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {
@@ -5,17 +5,15 @@
 #include <gtest/gtest.h>
 #include <thrust/host_vector.h>  // for host_vector

-#include "../../../src/collective/coll.h"  // for Coll
 #include "../../../src/common/common.h"
 #include "../../../src/common/device_helpers.cuh"  // for ToSpan, device_vector
 #include "../../../src/common/type.h"  // for EraseType
-#include "../helpers.h"  // for MakeCUDACtx
 #include "test_worker.cuh"  // for NCCLWorkerForTest
 #include "test_worker.h"  // for WorkerForTest, TestDistributed

 namespace xgboost::collective {
 namespace {
-class AllreduceTestGPU : public SocketTest {};
+class MGPUAllreduceTest : public SocketTest {};

 class Worker : public NCCLWorkerForTest {
  public:
@@ -47,7 +45,7 @@ class Worker : public NCCLWorkerForTest {
 };
 }  // namespace

-TEST_F(AllreduceTestGPU, BitOr) {
+TEST_F(MGPUAllreduceTest, BitOr) {
   auto n_workers = common::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {
@@ -57,7 +55,7 @@ TEST_F(AllreduceTestGPU, BitOr) {
   });
 }

-TEST_F(AllreduceTestGPU, Sum) {
+TEST_F(MGPUAllreduceTest, Sum) {
   auto n_workers = common::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {
@@ -8,6 +8,7 @@
 #include <bitset>
 #include <string>  // for string

+#include "../../../src/collective/comm.cuh"
 #include "../../../src/collective/communicator-inl.cuh"
 #include "../../../src/collective/nccl_device_communicator.cuh"
 #include "../helpers.h"
@@ -16,17 +17,15 @@ namespace xgboost {
 namespace collective {

 TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) {
-  auto construct = []() { NcclDeviceCommunicator comm{-1, false}; };
+  auto construct = []() { NcclDeviceCommunicator comm{-1, false, DefaultNcclName()}; };
   EXPECT_THROW(construct(), dmlc::Error);
 }

 TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
-  try {
-    dh::safe_nccl(ncclSystemError);
-  } catch (dmlc::Error const& e) {
-    auto str = std::string{e.what()};
-    ASSERT_TRUE(str.find("environment variables") != std::string::npos);
-  }
+  auto stub = std::make_shared<NcclStub>(DefaultNcclName());
+  auto rc = GetNCCLResult(stub, ncclSystemError);
+  auto msg = rc.Report();
+  ASSERT_TRUE(msg.find("environment variables") != std::string::npos);
 }

 namespace {
@@ -33,7 +33,7 @@ class WorkerForTest {
         tracker_port_{port},
         world_size_{world},
         task_id_{"t:" + std::to_string(rank)},
-        comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_} {
+        comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_, DefaultNcclName()} {
     CHECK_EQ(world_size_, comm_.World());
   }
   virtual ~WorkerForTest() = default;
@@ -12,6 +12,7 @@ from hypothesis._settings import duration

 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.collective import CommunicatorContext
 from xgboost.testing.params import hist_parameter_strategy

 pytestmark = [
@@ -572,6 +573,65 @@ def test_with_asyncio(local_cuda_client: Client) -> None:
     assert isinstance(output["history"], dict)


+def test_invalid_nccl(local_cuda_client: Client) -> None:
+    client = local_cuda_client
+    workers = tm.get_client_workers(client)
+    args = client.sync(
+        dxgb._get_rabit_args, len(workers), dxgb._get_dask_config(), client
+    )
+
+    def run(wid: int) -> None:
+        ctx = CommunicatorContext(dmlc_nccl_path="foo", **args)
+        X, y, w = tm.make_regression(n_samples=10, n_features=10, use_cupy=True)
+
+        with ctx:
+            with pytest.raises(ValueError, match=r"pip install"):
+                xgb.QuantileDMatrix(X, y, weight=w)
+
+    futures = client.map(run, range(len(workers)), workers=workers)
+    client.gather(futures)
+
+
+@pytest.mark.parametrize("tree_method", ["hist", "approx"])
+def test_nccl_load(local_cuda_client: Client, tree_method: str) -> None:
+    X, y, w = tm.make_regression(128, 16, use_cupy=True)
+
+    def make_model() -> None:
+        xgb.XGBRegressor(
+            device="cuda",
+            tree_method=tree_method,
+            objective="reg:quantileerror",
+            verbosity=2,
+            quantile_alpha=[0.2, 0.8],
+        ).fit(X, y, sample_weight=w)
+
+    # no nccl load when using single-node.
+    with tm.captured_output() as (out, err):
+        make_model()
+    assert out.getvalue().find("NCCL") == -1
+    assert err.getvalue().find("NCCL") == -1
+
+    client = local_cuda_client
+    workers = tm.get_client_workers(client)
+    args = client.sync(
+        dxgb._get_rabit_args, len(workers), dxgb._get_dask_config(), client
+    )
+
+    # nccl is loaded
+    def run(wid: int) -> None:
+        # FIXME(jiamingy): https://github.com/dmlc/xgboost/issues/9147
+        from xgboost.core import _LIB, _register_log_callback
+
+        _register_log_callback(_LIB)
+
+        with CommunicatorContext(**args):
+            with tm.captured_output() as (out, err):
+                make_model()
+            assert out.getvalue().find("Loaded shared NCCL") != -1, out.getvalue()
+
+    futures = client.map(run, range(len(workers)), workers=workers)
+    client.gather(futures)
+
+
 async def run_from_dask_array_asyncio(scheduler_address: str) -> dxgb.TrainReturnT:
     async with Client(scheduler_address, asynchronous=True) as client:
         import cupy as cp
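The tests above pass a bogus `dmlc_nccl_path` to confirm the `pip install` hint is raised, and rely on the default lookup for the happy path. For users who install NCCL from PyPI, a hedged helper that locates the shared object inside the `nvidia-nccl-cu*` wheel so it can be handed to `dmlc_nccl_path`; the package layout used here is an assumption about the NVIDIA wheel, not something defined by this PR:

# Hypothetical helper: find libnccl.so.2 inside the nvidia-nccl-cu11/-cu12 wheel.
import os

def find_pip_nccl() -> str:
    from nvidia import nccl  # assumed layout of the nvidia-nccl-cu* packages
    path = os.path.join(os.path.dirname(nccl.__file__), "lib", "libnccl.so.2")
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    return path

if __name__ == "__main__":
    # The returned path could then be passed as dmlc_nccl_path.
    print(find_pip_nccl())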