Scala 2.13 support. (#9099)

1. Updated the test logic.
2. Added smoke tests for Spark examples.
3. Added integration tests for Spark with Scala 2.13.
This commit is contained in:
Boris
2023-05-27 13:34:02 +02:00
committed by GitHub
parent 8c174ef2d3
commit a01df102c9
24 changed files with 325 additions and 160 deletions

View File

@@ -4,11 +4,18 @@ set -euo pipefail
source tests/buildkite/conftest.sh
echo "--- Build XGBoost JVM packages"
# Default build (Scala 2.12): only the Spark version is passed to
# build_jvm_packages.sh.
echo "--- Build XGBoost JVM packages scala 2.12"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION}
# Scala 2.13 build: the two empty strings are placeholders so that "true" lands
# in the fourth positional argument.
# NOTE(review): presumably build_jvm_packages.sh reads $2/$3 as the CUDA and
# GPU-arch options and $4 as the Scala 2.13 switch; confirm that ci_build.sh
# forwards empty arguments unchanged.
echo "--- Build XGBoost JVM packages scala 2.13"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION} "" "" "true"
# Upload every JAR found in each module's target/ directory as a Buildkite
# artifact.
echo "--- Stash XGBoost4J JARs"
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"

View File

@@ -25,7 +25,7 @@ set -x
CUDA_VERSION=11.8.0
NCCL_VERSION=2.16.5-1
RAPIDS_VERSION=23.02
SPARK_VERSION=3.1.1
SPARK_VERSION=3.4.0
JDK_VERSION=8
if [[ -z ${BUILDKITE:-} ]]

View File

@@ -20,10 +20,14 @@ RUN \
wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
# Spark
wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \
ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark
# Spark with scala 2.12
mkdir -p /opt/spark-scala-2.12 && \
wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz && \
tar xvf spark-$SPARK_VERSION-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \
# Spark with scala 2.13
mkdir -p /opt/spark-scala-2.13 && \
wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz && \
tar xvf spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13
# Put the conda (mambaforge), Spark and Maven executables ahead of the system PATH.
# NOTE(review): the Scala-specific Spark distributions are unpacked into
# /opt/spark-scala-2.12 and /opt/spark-scala-2.13; confirm that /opt/spark is
# still created, otherwise the /opt/spark/bin entry below resolves to nothing
# and a bare `spark-submit` will not be found via PATH.
ENV PATH=/opt/mambaforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH

View File

@@ -6,6 +6,7 @@ set -x
spark_version=$1
use_cuda=$2
gpu_arch=$3
use_scala213=$4
gpu_options=""
if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then
@@ -22,7 +23,13 @@ export RABIT_MOCK=ON
# Export the requested GPU architecture as GPU_ARCH_FLAG, but only when the
# caller actually supplied one (third positional argument, non-empty).
if [ -n "$gpu_arch" ]; then
  GPU_ARCH_FLAG=$gpu_arch
  export GPU_ARCH_FLAG
fi
mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options
# Choose the Maven profile option for the requested Scala version. With an
# empty use_scala213 the string stays empty and no profile option is passed;
# any non-empty value selects the "default" and "scala-2.13" profiles.
mvn_profile_string=""
if [ -n "$use_scala213" ]; then
  mvn_profile_string="-Pdefault,scala-2.13"
  export mvn_profile_string
fi

# $mvn_profile_string and $gpu_options are deliberately left unquoted: each
# must disappear from the command line when empty.
# shellcheck disable=SC2086
mvn --no-transfer-progress package \
  $mvn_profile_string \
  -Dspark.version=${spark_version} \
  $gpu_options
set +x
set +e

View File

@@ -6,37 +6,56 @@ set -x
# Initialize local Maven repository
./tests/ci_build/initialize_maven.sh
# Get version number of XGBoost4J and other auxiliary information
cd jvm-packages
jvm_packages_dir=`pwd`
# Get version number of XGBoost4J and other auxiliary information
xgboost4j_version=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
maven_compiler_source=$(mvn help:evaluate -Dexpression=maven.compiler.source -q -DforceStdout)
maven_compiler_target=$(mvn help:evaluate -Dexpression=maven.compiler.target -q -DforceStdout)
spark_version=$(mvn help:evaluate -Dexpression=spark.version -q -DforceStdout)
scala_version=$(mvn help:evaluate -Dexpression=scala.version -q -DforceStdout)
scala_binary_version=$(mvn help:evaluate -Dexpression=scala.binary.version -q -DforceStdout)
# Install XGBoost4J JAR into local Maven repository
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}-tests.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=test-jar -Dclassifier=tests
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-spark_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-example_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
cd xgboost4j-tester
# Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests
python3 ./generate_pom.py ${xgboost4j_version} ${maven_compiler_source} ${maven_compiler_target} ${spark_version} ${scala_version} ${scala_binary_version}
# Run unit tests with XGBoost4J
mvn --no-transfer-progress package
# Run integration tests with XGBoost4J
java -jar ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar
# Run integration tests with XGBoost4J-Spark
if [ ! -z "$RUN_INTEGRATION_TEST" ]
then
if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
cd $jvm_packages_dir/xgboost4j-tester
python3 get_iris.py
spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv
spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv ${PWD}/native_model ${PWD}/pipeline_model
cd $jvm_packages_dir
fi
# Build and run the xgboost4j-tester project once per supported Scala version.
# The empty profile string passes no profile option to Maven (2.12 is the
# default at the moment); the second entry activates the scala-2.13 profile.
#
# Reads from the surrounding script: xgboost4j_version, maven_compiler_source,
# maven_compiler_target, spark_version, jvm_packages_dir and the optional
# RUN_INTEGRATION_TEST environment variable. Expects the current directory to
# be $jvm_packages_dir at the start of every iteration.
for _maven_profile_string in "" "-Pdefault,scala-2.13"; do
  # $_maven_profile_string is intentionally unquoted: when empty it must expand
  # to no argument at all rather than to an empty-string argument.
  # shellcheck disable=SC2086
  scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout)
  # shellcheck disable=SC2086
  scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout)

  # Install XGBoost4J JAR into local Maven repository
  mvn --no-transfer-progress install:install-file \
    "-Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar" \
    -DgroupId=ml.dmlc \
    "-DartifactId=xgboost4j_${scala_binary_version}" \
    "-Dversion=${xgboost4j_version}" \
    -Dpackaging=jar
  mvn --no-transfer-progress install:install-file \
    "-Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}-tests.jar" \
    -DgroupId=ml.dmlc \
    "-DartifactId=xgboost4j_${scala_binary_version}" \
    "-Dversion=${xgboost4j_version}" \
    -Dpackaging=test-jar \
    -Dclassifier=tests
  mvn --no-transfer-progress install:install-file \
    "-Dfile=./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar" \
    -DgroupId=ml.dmlc \
    "-DartifactId=xgboost4j-spark_${scala_binary_version}" \
    "-Dversion=${xgboost4j_version}" \
    -Dpackaging=jar
  mvn --no-transfer-progress install:install-file \
    "-Dfile=./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar" \
    -DgroupId=ml.dmlc \
    "-DartifactId=xgboost4j-example_${scala_binary_version}" \
    "-Dversion=${xgboost4j_version}" \
    -Dpackaging=jar

  cd xgboost4j-tester
  # Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests
  python3 ./generate_pom.py \
    "${xgboost4j_version}" \
    "${maven_compiler_source}" \
    "${maven_compiler_target}" \
    "${spark_version}" \
    "${scala_version}" \
    "${scala_binary_version}"
  # Build package and unit tests with XGBoost4J
  mvn --no-transfer-progress clean package
  xgboost4j_tester_jar="$jvm_packages_dir/xgboost4j-tester/target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar"
  # Run integration tests with XGBoost4J
  java -jar "$xgboost4j_tester_jar"

  # Run integration tests with XGBoost4J-Spark
  if [ -n "${RUN_INTEGRATION_TEST:-}" ]; then
    # Each Scala binary version has its own Spark distribution. An unknown
    # version is a hard error: silently skipping it would let the job pass
    # without having run the Spark integration tests at all.
    case "$scala_binary_version" in
      2.12 | 2.13)
        spark_submit="/opt/spark-scala-${scala_binary_version}/bin/spark-submit"
        ;;
      *)
        echo "Unexpected scala version: $scala_version ($scala_binary_version)." >&2
        exit 1
        ;;
    esac
    iris_csv="$jvm_packages_dir/xgboost4j-tester/iris.csv"

    # Changing directory so that we do not mix code and resulting files
    cd target
    "$spark_submit" \
      --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining \
      --master 'local[8]' \
      "$xgboost4j_tester_jar" "$iris_csv"
    # Model output paths carry the Scala version so the two iterations do not
    # overwrite each other's results.
    "$spark_submit" \
      --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline \
      --master 'local[8]' \
      "$xgboost4j_tester_jar" "$iris_csv" \
      "${PWD}/native_model-${scala_version}" \
      "${PWD}/pipeline_model-${scala_version}"
  fi

  # Return to the JVM packages root so the next iteration's relative paths work.
  cd "$jvm_packages_dir"
done
set +x
set +e