[BLOCKING] [jvm-packages] add gpu_hist and enable gpu scheduling (#5171)

* [jvm-packages] add gpu_hist tree method

* change updater hist to grow_quantile_histmaker

* add gpu scheduling

* pass correct parameters to xgboost library

* remove debug info

* add use.cuda for pom

* add CI for gpu_hist for jvm

* add gpu unit tests

* use gpu node to build jvm

* use nvidia-docker

* Add CLI interface to create_jni.py using argparse

Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Bobby Wang
2020-07-27 12:53:24 +08:00
committed by GitHub
parent 6347fa1c2e
commit 8943eb4314
18 changed files with 543 additions and 123 deletions

View File

@@ -0,0 +1,51 @@
ARG CUDA_VERSION
FROM nvidia/cuda:$CUDA_VERSION-runtime-ubuntu16.04
ARG JDK_VERSION=8
ARG SPARK_VERSION=3.0.0
# Environment
ENV DEBIAN_FRONTEND noninteractive
# Install all basic requirements
RUN \
apt-get update && \
apt-get install -y software-properties-common && \
add-apt-repository ppa:openjdk-r/ppa && \
apt-get update && \
apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \
# Python
wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python && \
/opt/python/bin/pip install awscli && \
# Maven
wget https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
# Spark
wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \
ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark
ENV PATH=/opt/python/bin:/opt/spark/bin:/opt/maven/bin:$PATH
# Install Python packages
RUN \
pip install numpy scipy pandas scikit-learn
ENV GOSU_VERSION 1.10
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true
# Set default JDK version
RUN update-java-alternatives -v -s java-1.$JDK_VERSION.0-openjdk-amd64
# Default entry-point to use if running locally
# It will preserve attributes of created files
COPY entrypoint.sh /scripts/
WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]

View File

@@ -0,0 +1,63 @@
ARG CUDA_VERSION
FROM nvidia/cuda:$CUDA_VERSION-devel-centos6
ARG CUDA_VERSION
# Environment
ENV DEBIAN_FRONTEND noninteractive
ENV DEVTOOLSET_URL_ROOT http://vault.centos.org/6.9/sclo/x86_64/rh/devtoolset-4/
# Install all basic requirements
RUN \
yum -y update && \
yum install -y tar unzip wget xz git centos-release-scl yum-utils java-1.8.0-openjdk-devel && \
yum-config-manager --enable centos-sclo-rh-testing && \
yum -y update && \
yum install -y $DEVTOOLSET_URL_ROOT/devtoolset-4-gcc-5.3.1-6.1.el6.x86_64.rpm \
$DEVTOOLSET_URL_ROOT/devtoolset-4-gcc-c++-5.3.1-6.1.el6.x86_64.rpm \
$DEVTOOLSET_URL_ROOT/devtoolset-4-binutils-2.25.1-8.el6.x86_64.rpm \
$DEVTOOLSET_URL_ROOT/devtoolset-4-runtime-4.1-3.sc1.el6.x86_64.rpm \
$DEVTOOLSET_URL_ROOT/devtoolset-4-libstdc++-devel-5.3.1-6.1.el6.x86_64.rpm && \
# Python
wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python && \
# CMake
wget -nv -nc https://cmake.org/files/v3.13/cmake-3.13.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.13.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Maven
wget https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
ln -s /opt/apache-maven-3.6.1/ /opt/maven
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.4.8-1 && \
wget https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
yum -y update && \
yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} && \
rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm;
ENV PATH=/opt/python/bin:/opt/maven/bin:$PATH
ENV CC=/opt/rh/devtoolset-4/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-4/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-4/root/usr/bin/cpp
# Install Python packages
RUN \
pip install numpy pytest scipy scikit-learn wheel kubernetes urllib3==1.22 awscli
ENV GOSU_VERSION 1.10
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true
# Default entry-point to use if running locally
# It will preserve attributes of created files
COPY entrypoint.sh /scripts/
WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]

View File

@@ -3,12 +3,15 @@
set -e
set -x
if [ $# -ne 1 ]; then
echo "Usage: $0 [spark version]"
exit 1
fi
spark_version=$1
use_cuda=$2
gpu_arch=$3
gpu_options=""
if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then
# Since building jvm for CPU will do unit tests, choose gpu-with-gpu-tests profile to build
gpu_options=" -Pgpu-with-gpu-tests "
fi
# Initialize local Maven repository
./tests/ci_build/initialize_maven.sh
@@ -16,7 +19,11 @@ spark_version=$1
rm -rf build/
cd jvm-packages
export RABIT_MOCK=ON
mvn --no-transfer-progress package -Dspark.version=${spark_version}
if [ "x$gpu_arch" != "x" ]; then
export GPU_ARCH_FLAG=$gpu_arch
fi
mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options
set +x
set +e

View File

@@ -0,0 +1,40 @@
#!/bin/bash
set -e
set -x
nvidia-smi
ls /usr/local/
# Initialize local Maven repository
./tests/ci_build/initialize_maven.sh
# Get version number of XGBoost4J and other auxiliary information
cd jvm-packages
xgboost4j_version=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
scala_binary_version=$(mvn help:evaluate -Dexpression=scala.binary.version -q -DforceStdout)
python3 xgboost4j-tester/get_iris.py
xgb_jars="./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar,./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar"
example_jar="./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar"
echo "Run SparkTraining locally ... "
spark-submit \
--master 'local[1]' \
--class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining \
--jars $xgb_jars \
$example_jar \
${PWD}/iris.csv gpu \
echo "Run SparkMLlibPipeline locally ... "
spark-submit \
--master 'local[1]' \
--class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline \
--jars $xgb_jars \
$example_jar \
${PWD}/iris.csv ${PWD}/native_model ${PWD}/pipeline_model gpu \
set +x
set +e