Use dlopen to load NCCL. (#9796)
This PR adds optional support for loading NCCL with `dlopen` as an alternative to compile-time linking. This addresses the size bloat of the PyPI binary release. - Add a CMake option to load `nccl` at runtime. - Add an NCCL stub. After this, `nccl` will be fetched from PyPI when XGBoost is installed with pip, either by a user or through `pyproject.toml`. Those who want to link NCCL at compile time can continue to do so without any change. At the moment, this is Linux-only, since we only support MNMG on Linux.
This commit is contained in:
@@ -15,6 +15,8 @@ class BuildConfiguration: # pylint: disable=R0902
|
||||
use_cuda: bool = False
|
||||
# Whether to enable NCCL
|
||||
use_nccl: bool = False
|
||||
# Whether to load nccl dynamically
|
||||
use_dlopen_nccl: bool = False
|
||||
# Whether to enable HDFS
|
||||
use_hdfs: bool = False
|
||||
# Whether to enable Azure Storage
|
||||
|
||||
@@ -29,7 +29,8 @@ classifiers = [
|
||||
]
|
||||
dependencies = [
|
||||
"numpy",
|
||||
"scipy"
|
||||
"scipy",
|
||||
"nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'"
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
|
||||
@@ -2,14 +2,15 @@
|
||||
import ctypes
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
from enum import IntEnum, unique
|
||||
from typing import Any, Dict, List
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ._typing import _T
|
||||
from .core import _LIB, _check_call, c_str, from_pystr_to_cstr, py_str
|
||||
from .core import _LIB, _check_call, build_info, c_str, from_pystr_to_cstr, py_str
|
||||
|
||||
LOGGER = logging.getLogger("[xgboost.collective]")
|
||||
|
||||
@@ -250,6 +251,31 @@ class CommunicatorContext:
|
||||
|
||||
def __init__(self, **args: Any) -> None:
    """Store the communicator arguments and, if needed, locate NCCL.

    The keyword arguments are kept on ``self.args``.  When the caller did
    not provide ``dmlc_nccl_path`` and the build information reports
    ``USE_DLOPEN_NCCL``, this tries to find ``libnccl.so.2`` next to the
    ``nvidia.nccl.lib`` package and records that path under
    ``dmlc_nccl_path``.  If the package cannot be imported, the arguments
    are left untouched.
    """
    self.args = args
    nccl_path_key = "dmlc_nccl_path"

    # Nothing to do when the caller already chose a path, or when the
    # library was not built to load NCCL at runtime.  The build info is
    # only queried if no explicit path was given.
    if args.get(nccl_path_key) is not None or not build_info()["USE_DLOPEN_NCCL"]:
        return

    try:
        # PyPI package of NCCL.
        from nvidia.nccl import lib

        # Two distributions are named nvidia-nccl: one from PyPI and one
        # from nvidia-pyindex.  Only the PyPI one is supported, since the
        # other is too old (2.9.8 as of writing).
        lib_file = lib.__file__
        lib_dir: Optional[str] = (
            os.path.dirname(lib_file) if lib_file is not None else None
        )
        if lib_dir:
            self.args[nccl_path_key] = os.path.join(lib_dir, "libnccl.so.2")
    except ImportError:
        # Best effort: without the PyPI package no path is injected.
        pass
|
||||
|
||||
def __enter__(self) -> Dict[str, Any]:
|
||||
init(**self.args)
|
||||
|
||||
@@ -184,6 +184,13 @@ def _py_version() -> str:
|
||||
return f.read().strip()
|
||||
|
||||
|
||||
def _register_log_callback(lib: ctypes.CDLL) -> None:
    """Register the Python log callback with the loaded library.

    Parameters
    ----------
    lib :
        The loaded shared library exposing ``XGBGetLastError`` and
        ``XGBRegisterLogCallback``.

    Raises
    ------
    XGBoostError
        If ``XGBRegisterLogCallback`` returns a non-zero status; the
        message is whatever ``XGBGetLastError`` reports.
    """
    # Declare the return type so the error message comes back as a C string
    # rather than ctypes' default int.
    lib.XGBGetLastError.restype = ctypes.c_char_p

    log_callback = _get_log_callback_func()
    # Stored on the library object; presumably this keeps the callback
    # alive for as long as the library is loaded (verify).
    lib.callback = log_callback  # type: ignore

    status = lib.XGBRegisterLogCallback(log_callback)
    if status != 0:
        raise XGBoostError(lib.XGBGetLastError())
|
||||
|
||||
|
||||
def _load_lib() -> ctypes.CDLL:
|
||||
"""Load xgboost Library."""
|
||||
lib_paths = find_lib_path()
|
||||
@@ -228,10 +235,7 @@ Likely causes:
|
||||
Error message(s): {os_error_list}
|
||||
"""
|
||||
)
|
||||
lib.XGBGetLastError.restype = ctypes.c_char_p
|
||||
lib.callback = _get_log_callback_func() # type: ignore
|
||||
if lib.XGBRegisterLogCallback(lib.callback) != 0:
|
||||
raise XGBoostError(lib.XGBGetLastError())
|
||||
_register_log_callback(lib)
|
||||
|
||||
def parse(ver: str) -> Tuple[int, int, int]:
|
||||
"""Avoid dependency on packaging (PEP 440)."""
|
||||
|
||||
Reference in New Issue
Block a user