xgboost/tests/ci_build/Dockerfile.gpu_build_centos7
Jiaming Yuan 0715ab3c10
Use dlopen to load NCCL. (#9796)
This PR adds optional support for loading nccl with `dlopen` as an alternative of compile time linking. This is to address the size bloat issue with the PyPI binary release.
- Add CMake option to load `nccl` at runtime.
- Add an NCCL stub.

After this, `nccl` will be fetched from PyPI when using pip to install XGBoost, either by a user or by `pyproject.toml`. Others who want to link the nccl at compile time can continue to do so without any change.

At the moment, this is Linux only since we only support MNMG on Linux.
2023-11-22 19:27:31 +08:00

73 lines
3.0 KiB
Docker

ARG CUDA_VERSION_ARG
FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG
ARG RAPIDS_VERSION_ARG
# Install all basic requirements
RUN \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
> /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
yum install -y epel-release centos-release-scl && \
yum-config-manager --enable centos-sclo-rh-testing && \
yum -y update && \
yum install -y tar unzip wget xz git which ninja-build devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ && \
# Python
wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
bash conda.sh -b -p /opt/mambaforge && \
/opt/mambaforge/bin/python -m pip install awscli && \
# CMake
wget -nv -nc https://cmake.org/files/v3.26/cmake-3.26.4-linux-x86_64.sh --no-check-certificate && \
bash cmake-3.26.4-linux-x86_64.sh --skip-license --prefix=/usr
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=$NCCL_VERSION_ARG && \
wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
yum -y update && \
yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm;
ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH
ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
ENV CUDAHOSTCXX=/opt/rh/devtoolset-9/root/usr/bin/c++
ENV GOSU_VERSION 1.10
# Install RMM
RUN git clone -b v${RAPIDS_VERSION_ARG}.00 https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \
pushd rmm && \
mkdir build && \
pushd build && \
cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=/opt/rmm -DCUDA_STATIC_RUNTIME=ON && \
cmake --build . --target install && \
popd && \
popd && \
rm -rf rmm
# Install gRPC
RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \
--recurse-submodules --depth 1 && \
pushd grpc && \
cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc -DCMAKE_CXX_VISIBILITY_PRESET=hidden && \
cmake --build build --target install && \
popd && \
rm -rf grpc
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true
# Default entry-point to use if running locally
# It will preserve attributes of created files
COPY entrypoint.sh /scripts/
WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]