PySpark XGBoost integration (#8020)

Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
This commit is contained in:
WeichenXu
2022-07-13 13:11:18 +08:00
committed by GitHub
parent 8959622836
commit 176fec8789
25 changed files with 3650 additions and 12 deletions

View File

@@ -10,7 +10,7 @@ RUN \
apt-get install -y software-properties-common && \
add-apt-repository ppa:ubuntu-toolchain-r/test && \
apt-get update && \
apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 && \
apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 openjdk-8-jdk-headless && \
# CMake
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
@@ -24,6 +24,7 @@ ENV CXX=g++-8
ENV CPP=cpp-8
ENV GOSU_VERSION=1.10
# Point JAVA_HOME at the OpenJDK 8 JDK (openjdk-8-jdk-headless is installed via apt in this image).
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
# Create new Conda environment
COPY conda_env/cpu_test.yml /scripts/

View File

@@ -10,7 +10,7 @@ SHELL ["/bin/bash", "-c"] # Use Bash as shell
RUN \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
apt-get update && \
apt-get install -y wget unzip bzip2 libgomp1 build-essential && \
apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \
# Python
wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python
@@ -19,11 +19,14 @@ ENV PATH=/opt/python/bin:$PATH
# Create new Conda environment with cuDF, Dask, and cuPy
RUN \
conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
conda install -c conda-forge mamba && \
mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
python=3.8 cudf=22.04* rmm=22.04* cudatoolkit=$CUDA_VERSION_ARG dask dask-cuda=22.04* dask-cudf=22.04* cupy \
numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis
numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
pyspark cloudpickle cuda-python=11.7.0
ENV GOSU_VERSION=1.10
# Point JAVA_HOME at the OpenJDK 8 JDK (openjdk-8-jdk-headless is installed via apt in this image).
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \

View File

@@ -28,6 +28,8 @@ dependencies:
- llvmlite
- cffi
- pyarrow
- pyspark
- cloudpickle
- pip:
- shap
- awscli

View File

@@ -36,6 +36,8 @@ dependencies:
- cffi
- pyarrow
- protobuf<=3.20
- pyspark
- cloudpickle
- pip:
- shap
- ipython # required by shap at import time.

View File

@@ -35,6 +35,8 @@ dependencies:
- py-ubjson
- cffi
- pyarrow
- pyspark
- cloudpickle
- pip:
- sphinx_rtd_theme
- datatable

View File

@@ -34,6 +34,18 @@ function install_xgboost {
fi
}
# Export the environment variables used for the PySpark test runs.
# The driver and worker interpreter variables are both set to the `python`
# currently found on PATH, and SPARK_TESTING is set to 1.
function setup_pyspark_envs {
  local python_bin
  # `command -v` is the shell's built-in lookup (no dependency on the external
  # `which`). Fall back to an empty value when python is not on PATH so this
  # function never aborts a caller running under `set -e`.
  python_bin="$(command -v python)" || python_bin=""
  export PYSPARK_DRIVER_PYTHON="${python_bin}"
  export PYSPARK_PYTHON="${python_bin}"
  export SPARK_TESTING=1
}
# Drop the PySpark-related variables (interpreter paths and the SPARK_TESTING
# flag) from the current shell environment.
function unset_pyspark_envs {
  unset PYSPARK_DRIVER_PYTHON PYSPARK_PYTHON SPARK_TESTING
}
# Remove the xgboost package from the active Python environment without
# prompting for confirmation.
function uninstall_xgboost {
  pip uninstall --yes xgboost
}
@@ -43,14 +55,18 @@ case "$suite" in
gpu)
source activate gpu_test
install_xgboost
setup_pyspark_envs
pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu
unset_pyspark_envs
uninstall_xgboost
;;
mgpu)
source activate gpu_test
install_xgboost
setup_pyspark_envs
pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu
unset_pyspark_envs
cd tests/distributed
./runtests-gpu.sh
@@ -61,7 +77,9 @@ case "$suite" in
source activate cpu_test
install_xgboost
export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1
setup_pyspark_envs
pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python
unset_pyspark_envs
cd tests/distributed
./runtests.sh
uninstall_xgboost
@@ -70,7 +88,9 @@ case "$suite" in
cpu-arm64)
source activate aarch64_test
install_xgboost
setup_pyspark_envs
pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py
unset_pyspark_envs
uninstall_xgboost
;;