Use dlopen to load NCCL. (#9796)

This PR adds optional support for loading NCCL with `dlopen` as an alternative to compile-time linking. This addresses the size bloat of the PyPI binary release.

- Add a CMake option to load `nccl` at runtime.
- Add an NCCL stub.

After this, `nccl` will be fetched from PyPI when XGBoost is installed with pip, either by a user or through `pyproject.toml`. Those who want to link NCCL at compile time can continue to do so without any change. At the moment, this is Linux-only, since we only support MNMG (multi-node multi-GPU) on Linux.
2023-11-22 19:27:31 +08:00
parent fedd9674c8
commit 0715ab3c10
45 changed files with 658 additions and 268 deletions
--- a/tests/buildkite/build-cuda-with-rmm.sh
+++ b/tests/buildkite/build-cuda-with-rmm.sh
@@ -21,11 +21,18 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg
                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"

 echo "--- Build libxgboost from the source"
-$command_wrapper tests/ci_build/prune_libnccl.sh
-$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \
-  -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
-  -DPLUGIN_RMM=ON -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
-  -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
+$command_wrapper tests/ci_build/build_via_cmake.sh \
+		 -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \
+		 -DUSE_CUDA=ON \
+		 -DUSE_OPENMP=ON \
+		 -DHIDE_CXX_SYMBOLS=ON \
+		 -DPLUGIN_FEDERATED=ON \
+		 -DPLUGIN_RMM=ON \
+		 -DUSE_NCCL=ON \
+		 -DUSE_NCCL_LIB_PATH=ON \
+		 -DNCCL_INCLUDE_DIR=/usr/include \
+		 -DUSE_DLOPEN_NCCL=ON \
+  ${arch_flag}
 echo "--- Build binary wheel"
 $command_wrapper bash -c \
  "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"
--- a/tests/buildkite/build-cuda.sh
+++ b/tests/buildkite/build-cuda.sh
@@ -21,11 +21,17 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg
                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"

 echo "--- Build libxgboost from the source"
-$command_wrapper tests/ci_build/prune_libnccl.sh
-$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc" \
-  -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
-  -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
-  -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
+$command_wrapper tests/ci_build/build_via_cmake.sh \
+		 -DCMAKE_PREFIX_PATH="/opt/grpc" \
+		 -DUSE_CUDA=ON \
+		 -DUSE_OPENMP=ON \
+		 -DHIDE_CXX_SYMBOLS=ON \
+		 -DPLUGIN_FEDERATED=ON \
+		 -DUSE_NCCL=ON \
+		 -DUSE_NCCL_LIB_PATH=ON \
+		 -DNCCL_INCLUDE_DIR=/usr/include \
+		 -DUSE_DLOPEN_NCCL=ON \
+		 ${arch_flag}
 echo "--- Build binary wheel"
 $command_wrapper bash -c \
  "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"
--- a/tests/buildkite/test-cpp-gpu.sh
+++ b/tests/buildkite/test-cpp-gpu.sh
@@ -10,6 +10,7 @@ chmod +x build/testxgboost
 tests/ci_build/ci_build.sh gpu nvidia-docker \
  --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
  --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
  build/testxgboost

 echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
--- a/tests/buildkite/test-cpp-mgpu.sh
+++ b/tests/buildkite/test-cpp-mgpu.sh
@@ -13,4 +13,5 @@ chmod +x build/testxgboost
 tests/ci_build/ci_build.sh gpu nvidia-docker \
  --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
  --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
  build/testxgboost --gtest_filter=*MGPU*
--- a/tests/buildkite/test-python-gpu.sh
+++ b/tests/buildkite/test-python-gpu.sh
@@ -24,7 +24,8 @@ export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'

 command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
                `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
-                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "`
+		`"NCCL_VERSION_ARG=$NCCL_VERSION"

 # Run specified test suite
 case "$suite" in
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -2,6 +2,7 @@ ARG CUDA_VERSION_ARG
 FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04
 ARG CUDA_VERSION_ARG
 ARG RAPIDS_VERSION_ARG
+ARG NCCL_VERSION_ARG

 # Environment
 ENV DEBIAN_FRONTEND noninteractive
@@ -23,7 +24,9 @@ RUN \
    conda install -c conda-forge mamba && \
    mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
        python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
-        dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
+        "nccl>=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1)" \
+        dask \
+        dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
        numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
        pyspark>=3.4.0 cloudpickle cuda-python && \
    mamba clean --all && \
--- a/tests/ci_build/Dockerfile.gpu_build_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_centos7
@@ -27,7 +27,7 @@ RUN \
    wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
    rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
    yum -y update && \
-    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
+    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
    rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm;

 ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH
--- a/tests/ci_build/prune_libnccl.sh
+++ b/tests/ci_build/prune_libnccl.sh
@@ -1,35 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-rm -rf tmp_nccl
-
-mkdir tmp_nccl
-pushd tmp_nccl
-
-set -x
-
-cat << EOF > test.cu
-int main(void) { return 0; }
-EOF
-
-cat << EOF > CMakeLists.txt
-cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
-project(gencode_extractor CXX C)
-cmake_policy(SET CMP0104 NEW)
-set(CMAKE_CUDA_HOST_COMPILER \${CMAKE_CXX_COMPILER})
-enable_language(CUDA)
-include(../cmake/Utils.cmake)
-compute_cmake_cuda_archs("")
-add_library(test OBJECT test.cu)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-EOF
-
-cmake . -GNinja -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-gen_code=$(grep -o -- '--generate-code=\S*' compile_commands.json | paste -sd ' ')
-
-nvprune ${gen_code} /usr/lib64/libnccl_static.a -o ../libnccl_static.a
-
-popd
-rm -rf tmp_nccl
-
-set +x
--- a/tests/ci_build/rename_whl.py
+++ b/tests/ci_build/rename_whl.py
@@ -1,22 +1,10 @@
 import os
 import sys
-from contextlib import contextmanager
-
-
-@contextmanager
-def cd(path):
-    path = os.path.normpath(path)
-    cwd = os.getcwd()
-    os.chdir(path)
-    print("cd " + path)
-    try:
-        yield path
-    finally:
-        os.chdir(cwd)

+from test_utils import DirectoryExcursion

 if len(sys.argv) != 4:
-    print('Usage: {} [wheel to rename] [commit id] [platform tag]'.format(sys.argv[0]))
+    print("Usage: {} [wheel to rename] [commit id] [platform tag]".format(sys.argv[0]))
    sys.exit(1)


@@ -26,20 +14,26 @@ platform_tag = sys.argv[3]

 dirname, basename = os.path.dirname(whl_path), os.path.basename(whl_path)

-with cd(dirname):
-    tokens = basename.split('-')
+with DirectoryExcursion(dirname):
+    tokens = basename.split("-")
    assert len(tokens) == 5
-    version = tokens[1].split('+')[0]
-    keywords = {'pkg_name': tokens[0],
-                'version': version,
-                'commit_id': commit_id,
-                'platform_tag': platform_tag}
-    new_name = '{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl'.format(**keywords)
-    print('Renaming {} to {}...'.format(basename, new_name))
+    version = tokens[1].split("+")[0]
+    keywords = {
+        "pkg_name": tokens[0],
+        "version": version,
+        "commit_id": commit_id,
+        "platform_tag": platform_tag,
+    }
+    new_name = "{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl".format(
+        **keywords
+    )
+    print("Renaming {} to {}...".format(basename, new_name))
    if os.path.isfile(new_name):
        os.remove(new_name)
    os.rename(basename, new_name)

    filesize = os.path.getsize(new_name) / 1024 / 1024  # MB
+    print(f"Wheel size: {filesize}")
+
    msg = f"Limit of wheel size set by PyPI is exceeded. {new_name}: {filesize}"
    assert filesize <= 300, msg
--- a/tests/cpp/collective/test_allgather.cu
+++ b/tests/cpp/collective/test_allgather.cu
@@ -90,10 +90,10 @@ class Worker : public NCCLWorkerForTest {
  }
 };

-class AllgatherTestGPU : public SocketTest {};
+class MGPUAllgatherTest : public SocketTest {};
 }  // namespace

-TEST_F(AllgatherTestGPU, MGPUTestVRing) {
+TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
  auto n_workers = common::AllVisibleGPUs();
  TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                 std::int32_t r) {
@@ -104,7 +104,7 @@ TEST_F(AllgatherTestGPU, MGPUTestVRing) {
  });
 }

-TEST_F(AllgatherTestGPU, MGPUTestVBcast) {
+TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
  auto n_workers = common::AllVisibleGPUs();
  TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                 std::int32_t r) {
--- a/tests/cpp/collective/test_allreduce.cu
+++ b/tests/cpp/collective/test_allreduce.cu
@@ -5,17 +5,15 @@
 #include <gtest/gtest.h>
 #include <thrust/host_vector.h>  // for host_vector

-#include "../../../src/collective/coll.h"  // for Coll
 #include "../../../src/common/common.h"
 #include "../../../src/common/device_helpers.cuh"  // for ToSpan,  device_vector
 #include "../../../src/common/type.h"              // for EraseType
-#include "../helpers.h"                            // for MakeCUDACtx
 #include "test_worker.cuh"                         // for NCCLWorkerForTest
 #include "test_worker.h"                           // for WorkerForTest, TestDistributed

 namespace xgboost::collective {
 namespace {
-class AllreduceTestGPU : public SocketTest {};
+class MGPUAllreduceTest : public SocketTest {};

 class Worker : public NCCLWorkerForTest {
 public:
@@ -47,7 +45,7 @@ class Worker : public NCCLWorkerForTest {
 };
 }  // namespace

-TEST_F(AllreduceTestGPU, BitOr) {
+TEST_F(MGPUAllreduceTest, BitOr) {
  auto n_workers = common::AllVisibleGPUs();
  TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                 std::int32_t r) {
@@ -57,7 +55,7 @@ TEST_F(AllreduceTestGPU, BitOr) {
  });
 }

-TEST_F(AllreduceTestGPU, Sum) {
+TEST_F(MGPUAllreduceTest, Sum) {
  auto n_workers = common::AllVisibleGPUs();
  TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                 std::int32_t r) {
--- a/tests/cpp/collective/test_nccl_device_communicator.cu
+++ b/tests/cpp/collective/test_nccl_device_communicator.cu
@@ -8,6 +8,7 @@
 #include <bitset>
 #include <string>  // for string

+#include "../../../src/collective/comm.cuh"
 #include "../../../src/collective/communicator-inl.cuh"
 #include "../../../src/collective/nccl_device_communicator.cuh"
 #include "../helpers.h"
@@ -16,17 +17,15 @@ namespace xgboost {
 namespace collective {

 TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) {
-  auto construct = []() { NcclDeviceCommunicator comm{-1, false}; };
+  auto construct = []() { NcclDeviceCommunicator comm{-1, false, DefaultNcclName()}; };
  EXPECT_THROW(construct(), dmlc::Error);
 }

 TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
-  try {
-    dh::safe_nccl(ncclSystemError);
-  } catch (dmlc::Error const& e) {
-    auto str = std::string{e.what()};
-    ASSERT_TRUE(str.find("environment variables") != std::string::npos);
-  }
+  auto stub = std::make_shared<NcclStub>(DefaultNcclName());
+  auto rc = GetNCCLResult(stub, ncclSystemError);
+  auto msg = rc.Report();
+  ASSERT_TRUE(msg.find("environment variables") != std::string::npos);
 }

 namespace {
--- a/tests/cpp/collective/test_worker.h
+++ b/tests/cpp/collective/test_worker.h
@@ -33,7 +33,7 @@ class WorkerForTest {
        tracker_port_{port},
        world_size_{world},
        task_id_{"t:" + std::to_string(rank)},
-        comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_} {
+        comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_, DefaultNcclName()} {
    CHECK_EQ(world_size_, comm_.World());
  }
  virtual ~WorkerForTest() = default;
--- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
+++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
@@ -12,6 +12,7 @@ from hypothesis._settings import duration

 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.collective import CommunicatorContext
 from xgboost.testing.params import hist_parameter_strategy

 pytestmark = [
@@ -572,6 +573,65 @@ def test_with_asyncio(local_cuda_client: Client) -> None:
    assert isinstance(output["history"], dict)


+def test_invalid_nccl(local_cuda_client: Client) -> None:  # a bad NCCL path must fail with an actionable error
+    client = local_cuda_client
+    workers = tm.get_client_workers(client)
+    args = client.sync(  # communicator arguments, later forwarded to CommunicatorContext
+        dxgb._get_rabit_args, len(workers), dxgb._get_dask_config(), client
+    )
+
+    def run(wid: int) -> None:  # submitted to the workers below; `wid` itself is unused
+        ctx = CommunicatorContext(dmlc_nccl_path="foo", **args)  # "foo" is presumably not a loadable library
+        X, y, w = tm.make_regression(n_samples=10, n_features=10, use_cupy=True)
+
+        with ctx:
+            with pytest.raises(ValueError, match=r"pip install"):  # the message must mention "pip install"
+                xgb.QuantileDMatrix(X, y, weight=w)
+
+    futures = client.map(run, range(len(workers)), workers=workers)  # len(workers) tasks, restricted to `workers`
+    client.gather(futures)
+
+
+@pytest.mark.parametrize("tree_method", ["hist", "approx"])
+def test_nccl_load(local_cuda_client: Client, tree_method: str) -> None:  # NCCL is loaded only inside a communicator
+    X, y, w = tm.make_regression(128, 16, use_cupy=True)
+
+    def make_model() -> None:  # fit a regressor on device="cuda"; the fitted model is discarded
+        xgb.XGBRegressor(
+            device="cuda",
+            tree_method=tree_method,
+            objective="reg:quantileerror",
+            verbosity=2,  # presumably verbose enough for the NCCL load message to be logged
+            quantile_alpha=[0.2, 0.8],
+        ).fit(X, y, sample_weight=w)
+
+    # Single-node training must not load NCCL: neither stdout nor stderr may mention it.
+    with tm.captured_output() as (out, err):
+        make_model()
+        assert out.getvalue().find("NCCL") == -1
+        assert err.getvalue().find("NCCL") == -1
+
+    client = local_cuda_client
+    workers = tm.get_client_workers(client)
+    args = client.sync(  # communicator arguments, later forwarded to CommunicatorContext
+        dxgb._get_rabit_args, len(workers), dxgb._get_dask_config(), client
+    )
+
+    # Inside a communicator context each task must log that the shared NCCL library was loaded.
+    def run(wid: int) -> None:
+        # FIXME(jiamingy): https://github.com/dmlc/xgboost/issues/9147
+        from xgboost.core import _LIB, _register_log_callback
+        _register_log_callback(_LIB)
+
+        with CommunicatorContext(**args):
+            with tm.captured_output() as (out, err):
+                make_model()
+                assert out.getvalue().find("Loaded shared NCCL") != -1, out.getvalue()  # show the captured log on failure
+
+    futures = client.map(run, range(len(workers)), workers=workers)  # len(workers) tasks, restricted to `workers`
+    client.gather(futures)
+
+
 async def run_from_dask_array_asyncio(scheduler_address: str) -> dxgb.TrainReturnT:
    async with Client(scheduler_address, asynchronous=True) as client:
        import cupy as cp