[BLOCKING] [jvm-packages] add gpu_hist and enable gpu scheduling (#5171)
* [jvm-packages] add gpu_hist tree method * change updater hist to grow_quantile_histmaker * add gpu scheduling * pass correct parameters to xgboost library * remove debug info * add use.cuda for pom * add CI for gpu_hist for jvm * add gpu unit tests * use gpu node to build jvm * use nvidia-docker * Add CLI interface to create_jni.py using argparse Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
51
tests/ci_build/Dockerfile.gpu_jvm
Normal file
51
tests/ci_build/Dockerfile.gpu_jvm
Normal file
@@ -0,0 +1,51 @@
|
||||
# CUDA_VERSION has no default, so it is expected to be supplied at build time;
# it selects the tag of the CUDA runtime base image (Ubuntu 16.04).
ARG CUDA_VERSION
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu16.04
# Build-time versions of the JDK package and the Spark distribution installed below.
ARG JDK_VERSION=8
ARG SPARK_VERSION=3.0.0
|
||||
|
||||
# Environment
|
||||
# Keep apt/dpkg from prompting during the image build.
# Uses the key=value form; the space-separated `ENV key value` form is legacy.
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install all basic requirements
|
||||
# Installs OS packages, OpenJDK, Miniconda (as /opt/python), awscli, Maven and
# Spark in a single layer. apt metadata, the pip cache and every downloaded
# installer/archive are removed in this same layer, because files deleted in a
# later layer would still be stored in the image.
RUN \
    apt-get update && \
    apt-get install -y software-properties-common && \
    # -y: never wait for confirmation when adding the OpenJDK PPA
    add-apt-repository -y ppa:openjdk-r/ppa && \
    apt-get update && \
    apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \
    rm -rf /var/lib/apt/lists/* && \
    # Python
    wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash Miniconda3.sh -b -p /opt/python && \
    rm -f Miniconda3.sh && \
    /opt/python/bin/pip install --no-cache-dir awscli && \
    # Maven
    wget https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
    tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
    rm -f apache-maven-3.6.1-bin.tar.gz && \
    ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
    # Spark
    wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
    tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \
    rm -f spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
    ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark
|
||||
|
||||
# Put the Miniconda (/opt/python), Spark and Maven executables ahead of the inherited PATH.
ENV PATH=/opt/python/bin:/opt/spark/bin:/opt/maven/bin:$PATH
|
||||
|
||||
# Install Python packages
|
||||
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN \
    pip install --no-cache-dir numpy scipy pandas scikit-learn
|
||||
|
||||
# Version of the gosu release fetched from GitHub below (key=value form; the
# space-separated `ENV key value` form is legacy).
ENV GOSU_VERSION=1.10
|
||||
|
||||
# Install lightweight sudo (not bound to TTY)
|
||||
# Download the gosu binary, make it executable, then smoke-test it by running
# `true` as user nobody. `set -e` aborts the step at the first failing command;
# `set -x` echoes each command into the build log.
RUN set -ex; \
    wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64"; \
    chmod +x /usr/local/bin/gosu; \
    gosu nobody true
|
||||
|
||||
# Set default JDK version
|
||||
# Select (-s) the OpenJDK that matches JDK_VERSION as the system default, with verbose (-v) output.
RUN update-java-alternatives -v -s java-1.$JDK_VERSION.0-openjdk-amd64
|
||||
|
||||
# Default entry-point to use if running locally
|
||||
# It will preserve attributes of created files
|
||||
# entrypoint.sh is taken from the build context and installed under /scripts.
COPY entrypoint.sh /scripts/entrypoint.sh

# Containers start in /workspace and run through the entrypoint script.
WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]
|
||||
63
tests/ci_build/Dockerfile.jvm_gpu_build
Normal file
63
tests/ci_build/Dockerfile.jvm_gpu_build
Normal file
@@ -0,0 +1,63 @@
|
||||
# CUDA_VERSION has no default, so it is expected to be supplied at build time;
# it selects the tag of the CUDA devel base image (CentOS 6).
ARG CUDA_VERSION
FROM nvidia/cuda:${CUDA_VERSION}-devel-centos6
# An ARG declared before FROM is only visible to FROM; redeclare it so the
# RUN steps of this stage can read it as well.
ARG CUDA_VERSION
|
||||
|
||||
# Environment
|
||||
# NOTE(review): DEBIAN_FRONTEND is an apt/dpkg setting; on this CentOS base it is
# presumably kept only for parity with the Ubuntu images — confirm before removing.
ENV DEBIAN_FRONTEND=noninteractive
# Base URL of the devtoolset-4 RPMs installed by the next RUN step.
ENV DEVTOOLSET_URL_ROOT=http://vault.centos.org/6.9/sclo/x86_64/rh/devtoolset-4/
|
||||
|
||||
# Install all basic requirements
|
||||
# Installs OS packages, the devtoolset-4 GCC 5.3.1 toolchain, Miniconda (as
# /opt/python), CMake 3.13.0 (into /usr) and Maven 3.6.1 in a single layer.
# The yum cache and every downloaded installer/archive are removed in this
# same layer so they are not stored in the image.
RUN \
    yum -y update && \
    yum install -y tar unzip wget xz git centos-release-scl yum-utils java-1.8.0-openjdk-devel && \
    yum-config-manager --enable centos-sclo-rh-testing && \
    yum -y update && \
    yum install -y $DEVTOOLSET_URL_ROOT/devtoolset-4-gcc-5.3.1-6.1.el6.x86_64.rpm \
                   $DEVTOOLSET_URL_ROOT/devtoolset-4-gcc-c++-5.3.1-6.1.el6.x86_64.rpm \
                   $DEVTOOLSET_URL_ROOT/devtoolset-4-binutils-2.25.1-8.el6.x86_64.rpm \
                   $DEVTOOLSET_URL_ROOT/devtoolset-4-runtime-4.1-3.sc1.el6.x86_64.rpm \
                   $DEVTOOLSET_URL_ROOT/devtoolset-4-libstdc++-devel-5.3.1-6.1.el6.x86_64.rpm && \
    yum clean all && \
    # Python
    wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash Miniconda3.sh -b -p /opt/python && \
    rm -f Miniconda3.sh && \
    # CMake
    # NOTE(review): --no-check-certificate disables TLS verification for this
    # download; consider verifying a published checksum of the installer instead.
    wget -nv -nc https://cmake.org/files/v3.13/cmake-3.13.0-Linux-x86_64.sh --no-check-certificate && \
    bash cmake-3.13.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
    rm -f cmake-3.13.0-Linux-x86_64.sh && \
    # Maven
    wget https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
    tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
    rm -f apache-maven-3.6.1-bin.tar.gz && \
    ln -s /opt/apache-maven-3.6.1/ /opt/maven
|
||||
|
||||
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
|
||||
# Registers NVIDIA's machine-learning yum repository and installs the NCCL
# runtime, devel and static packages built for this image's CUDA release.
# CUDA_SHORT is the first "<digits>.<digit>" match in CUDA_VERSION; it forms the
# "+cuda<version>" suffix of the NCCL package names.
RUN \
    export CUDA_SHORT=$(echo $CUDA_VERSION | grep -E -o '[0-9]+\.[0-9]') && \
    export NCCL_VERSION=2.4.8-1 && \
    wget https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
    rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
    yum -y update && \
    yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
    # Drop the yum cache and the repo RPM in the same layer that created them.
    yum clean all && \
    rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm
|
||||
|
||||
# Expose Miniconda and Maven on PATH, and point CC/CXX/CPP at the devtoolset-4
# compiler binaries under /opt/rh/devtoolset-4.
ENV PATH=/opt/python/bin:/opt/maven/bin:$PATH \
    CC=/opt/rh/devtoolset-4/root/usr/bin/gcc \
    CXX=/opt/rh/devtoolset-4/root/usr/bin/c++ \
    CPP=/opt/rh/devtoolset-4/root/usr/bin/cpp
|
||||
|
||||
# Install Python packages
|
||||
# --no-cache-dir keeps pip's download cache out of the image layer.
# urllib3 stays pinned to 1.22 exactly as before.
RUN \
    pip install --no-cache-dir numpy pytest scipy scikit-learn wheel kubernetes urllib3==1.22 awscli
|
||||
|
||||
# Version of the gosu release fetched from GitHub below (key=value form; the
# space-separated `ENV key value` form is legacy).
ENV GOSU_VERSION=1.10
|
||||
|
||||
# Install lightweight sudo (not bound to TTY)
|
||||
# Download the gosu binary, make it executable, then smoke-test it by running
# `true` as user nobody. `set -e` aborts the step at the first failing command;
# `set -x` echoes each command into the build log.
RUN set -ex; \
    wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64"; \
    chmod +x /usr/local/bin/gosu; \
    gosu nobody true
|
||||
|
||||
# Default entry-point to use if running locally
|
||||
# It will preserve attributes of created files
|
||||
# entrypoint.sh is taken from the build context and installed under /scripts.
COPY entrypoint.sh /scripts/entrypoint.sh

# Containers start in /workspace and run through the entrypoint script.
WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]
|
||||
@@ -3,12 +3,15 @@
|
||||
set -e
|
||||
set -x
|
||||
|
||||
if [ $# -ne 1 ]; then
|
||||
echo "Usage: $0 [spark version]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
spark_version=$1
|
||||
use_cuda=$2
|
||||
gpu_arch=$3
|
||||
|
||||
gpu_options=""
|
||||
if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then
|
||||
# Since building jvm for CPU will do unit tests, choose gpu-with-gpu-tests profile to build
|
||||
gpu_options=" -Pgpu-with-gpu-tests "
|
||||
fi
|
||||
|
||||
# Initialize local Maven repository
|
||||
./tests/ci_build/initialize_maven.sh
|
||||
@@ -16,7 +19,11 @@ spark_version=$1
|
||||
rm -rf build/
|
||||
cd jvm-packages
|
||||
export RABIT_MOCK=ON
|
||||
mvn --no-transfer-progress package -Dspark.version=${spark_version}
|
||||
|
||||
if [ "x$gpu_arch" != "x" ]; then
|
||||
export GPU_ARCH_FLAG=$gpu_arch
|
||||
fi
|
||||
mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options
|
||||
|
||||
set +x
|
||||
set +e
|
||||
|
||||
40
tests/ci_build/test_jvm_gpu_cross.sh
Executable file
40
tests/ci_build/test_jvm_gpu_cross.sh
Executable file
@@ -0,0 +1,40 @@
|
||||
#!/bin/bash
# Runs the XGBoost4J Spark example jobs (SparkTraining and SparkMLlibPipeline)
# through spark-submit in local mode, passing "gpu" as the final argument.
# Expects to be started from the repository root: it calls
# ./tests/ci_build/initialize_maven.sh and then changes into jvm-packages.

set -e
set -x

# Log the visible GPUs and the contents of /usr/local for debugging.
nvidia-smi

ls /usr/local/

# Initialize local Maven repository
./tests/ci_build/initialize_maven.sh

# Get version number of XGBoost4J and other auxiliary information
cd jvm-packages
xgboost4j_version=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
scala_binary_version=$(mvn help:evaluate -Dexpression=scala.binary.version -q -DforceStdout)

python3 xgboost4j-tester/get_iris.py
xgb_jars="./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar,./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar"
example_jar="./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar"

# Expansions are quoted so a workspace path containing spaces is still passed
# as a single argument. The last argument line carries no trailing backslash,
# so the command can never absorb the line that follows it.
echo "Run SparkTraining locally ... "
spark-submit \
  --master 'local[1]' \
  --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining \
  --jars "$xgb_jars" \
  "$example_jar" \
  "${PWD}/iris.csv" gpu

echo "Run SparkMLlibPipeline locally ... "
spark-submit \
  --master 'local[1]' \
  --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline \
  --jars "$xgb_jars" \
  "$example_jar" \
  "${PWD}/iris.csv" "${PWD}/native_model" "${PWD}/pipeline_model" gpu

set +x
set +e
|
||||
Reference in New Issue
Block a user