sync Jun 1

2023-06-01 15:55:06 -07:00
parent c5b575e00e fa2ab1f021
commit 42867a4805
76 changed files with 1424 additions and 595 deletions
--- a/tests/ci_build/Dockerfile.clang_tidy
+++ b/tests/ci_build/Dockerfile.clang_tidy
@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04
+FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu22.04
 ARG CUDA_VERSION_ARG

 # Environment
@@ -7,22 +7,21 @@ ENV DEBIAN_FRONTEND noninteractive

 # Install all basic requirements
 RUN \
-    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
    apt-get update && \
-    apt-get install -y tar unzip wget git build-essential python3 python3-pip software-properties-common \
+    apt-get install -y wget git python3 python3-pip software-properties-common \
                       apt-transport-https ca-certificates gnupg-agent && \
-    wget -nv -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
-    add-apt-repository -u 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main' && \
-    apt-get update && \
    apt-get install -y llvm-15 clang-tidy-15 clang-15 libomp-15-dev && \
-    wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
-    bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr
+    apt-get install -y cmake

 # Set default clang-tidy version
 RUN \
    update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-15 100 && \
    update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 100

+RUN \
+    apt-get install libgtest-dev libgmock-dev -y
+
 # Install Python packages
 RUN \
    pip3 install pyyaml
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu18.04
+FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04
 ARG CUDA_VERSION_ARG
 ARG RAPIDS_VERSION_ARG

@@ -9,7 +9,7 @@ SHELL ["/bin/bash", "-c"]   # Use Bash as shell

 # Install all basic requirements
 RUN \
-    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
    apt-get update && \
    apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \
    # Python
@@ -25,7 +25,7 @@ RUN \
        python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
        dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
        numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
-        pyspark cloudpickle cuda-python && \
+        "pyspark>=3.4.0" cloudpickle cuda-python && \
    mamba clean --all && \
    conda run --no-capture-output -n gpu_test pip install buildkite-test-collector

--- a/tests/ci_build/Dockerfile.gpu_jvm
+++ b/tests/ci_build/Dockerfile.gpu_jvm
@@ -1,53 +0,0 @@
-ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu16.04
-ARG CUDA_VERSION_ARG
-ARG JDK_VERSION=8
-ARG SPARK_VERSION=3.0.0
-
-# Environment
-ENV DEBIAN_FRONTEND noninteractive
-
-# Install all basic requirements
-RUN \
-    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub && \
-    apt-get update && \
-    apt-get install -y software-properties-common && \
-    add-apt-repository ppa:openjdk-r/ppa && \
-    apt-get update && \
-    apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \
-    # Python
-    wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
-    bash conda.sh -b -p /opt/mambaforge && \
-    /opt/mambaforge/bin/pip install awscli && \
-    # Maven
-    wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
-    tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
-    ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
-    # Spark
-    wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
-    tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \
-    ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark
-
-ENV PATH=/opt/mambaforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH
-
-# Install Python packages
-RUN \
-    pip install numpy scipy pandas scikit-learn
-
-ENV GOSU_VERSION 1.10
-
-# Install lightweight sudo (not bound to TTY)
-RUN set -ex; \
-    wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
-    chmod +x /usr/local/bin/gosu && \
-    gosu nobody true
-
-# Set default JDK version
-RUN update-java-alternatives -v -s java-1.$JDK_VERSION.0-openjdk-amd64
-
-# Default entry-point to use if running locally
-# It will preserve attributes of created files
-COPY entrypoint.sh /scripts/
-
-WORKDIR /workspace
-ENTRYPOINT ["/scripts/entrypoint.sh"]
--- a/tests/ci_build/Dockerfile.jvm_cross
+++ b/tests/ci_build/Dockerfile.jvm_cross
@@ -20,10 +20,14 @@ RUN \
    wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
    tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
    ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
-    # Spark
-    wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
-    tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \
-    ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark
+    # Spark with scala 2.12
+    mkdir -p /opt/spark-scala-2.12 && \
+    wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz && \
+    tar xvf spark-$SPARK_VERSION-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \
+    # Spark with scala 2.13
+    mkdir -p /opt/spark-scala-2.13 && \
+    wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz && \
+    tar xvf spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13

 ENV PATH=/opt/mambaforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH

--- a/tests/ci_build/build_jvm_packages.sh
+++ b/tests/ci_build/build_jvm_packages.sh
@@ -6,6 +6,7 @@ set -x
 spark_version=$1
 use_cuda=$2
 gpu_arch=$3
+use_scala213=$4

 gpu_options=""
 if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then
@@ -22,7 +23,13 @@ export RABIT_MOCK=ON
 if [ "x$gpu_arch" != "x" ]; then
  export GPU_ARCH_FLAG=$gpu_arch
 fi
-mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options
+
+mvn_profile_string=""
+if [ "x$use_scala213" != "x" ]; then
+  export mvn_profile_string="-Pdefault,scala-2.13"
+fi
+
+mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options

 set +x
 set +e
--- a/tests/ci_build/conda_env/aarch64_test.yml
+++ b/tests/ci_build/conda_env/aarch64_test.yml
@@ -28,7 +28,7 @@ dependencies:
 - llvmlite
 - cffi
 - pyarrow
- pyspark
+- pyspark>=3.4.0
 - cloudpickle
 - pip:
  - awscli
--- a/tests/ci_build/conda_env/linux_cpu_test.yml
+++ b/tests/ci_build/conda_env/linux_cpu_test.yml
@@ -38,8 +38,6 @@ dependencies:
 - protobuf
 - cloudpickle
 - modin
-# TODO: Replace it with pyspark>=3.4 once 3.4 released.
-# - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
- pyspark>=3.3.1
+- pyspark>=3.4.0
 - pip:
  - datatable
--- a/tests/ci_build/conda_env/macos_cpu_test.yml
+++ b/tests/ci_build/conda_env/macos_cpu_test.yml
@@ -35,7 +35,7 @@ dependencies:
 - py-ubjson
 - cffi
 - pyarrow
- pyspark
+- pyspark>=3.4.0
 - cloudpickle
 - pip:
  - sphinx_rtd_theme
--- a/tests/ci_build/conda_env/python_lint.yml
+++ b/tests/ci_build/conda_env/python_lint.yml
@@ -19,6 +19,4 @@ dependencies:
 - pytest
 - hypothesis
 - hatchling
- pip:
-  # TODO: Replace it with pyspark>=3.4 once 3.4 released.
-  - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
+- pyspark>=3.4.0
--- a/tests/ci_build/test_jvm_cross.sh
+++ b/tests/ci_build/test_jvm_cross.sh
@@ -6,37 +6,56 @@ set -x
 # Initialize local Maven repository
 ./tests/ci_build/initialize_maven.sh

-# Get version number of XGBoost4J and other auxiliary information
 cd jvm-packages
+jvm_packages_dir=`pwd`
+# Get version number of XGBoost4J and other auxiliary information
 xgboost4j_version=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
 maven_compiler_source=$(mvn help:evaluate -Dexpression=maven.compiler.source -q -DforceStdout)
 maven_compiler_target=$(mvn help:evaluate -Dexpression=maven.compiler.target -q -DforceStdout)
 spark_version=$(mvn help:evaluate -Dexpression=spark.version -q -DforceStdout)
-scala_version=$(mvn help:evaluate -Dexpression=scala.version -q -DforceStdout)
-scala_binary_version=$(mvn help:evaluate -Dexpression=scala.binary.version -q -DforceStdout)

-# Install XGBoost4J JAR into local Maven repository
-mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
-mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}-tests.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=test-jar -Dclassifier=tests
-mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-spark_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
-mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-example_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
-
-cd xgboost4j-tester
-# Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests
-python3 ./generate_pom.py ${xgboost4j_version} ${maven_compiler_source} ${maven_compiler_target} ${spark_version} ${scala_version} ${scala_binary_version}
-# Run unit tests with XGBoost4J
-mvn --no-transfer-progress package
-
-# Run integration tests with XGBoost4J
-java -jar ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar
-
-# Run integration tests with XGBoost4J-Spark
-if [ ! -z "$RUN_INTEGRATION_TEST" ]
-then
+if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
+  cd $jvm_packages_dir/xgboost4j-tester
  python3 get_iris.py
-  spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv
-  spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv ${PWD}/native_model ${PWD}/pipeline_model
+  cd $jvm_packages_dir
 fi

+# including maven profiles for different scala versions: 2.12 is the default at the moment.
+for _maven_profile_string in "" "-Pdefault,scala-2.13"; do
+  scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout)
+  scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout)
+
+  # Install XGBoost4J JAR into local Maven repository
+  mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
+  mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}-tests.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=test-jar -Dclassifier=tests
+  mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-spark_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
+  mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-example_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
+
+  cd xgboost4j-tester
+  # Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests
+  python3 ./generate_pom.py ${xgboost4j_version} ${maven_compiler_source} ${maven_compiler_target} ${spark_version} ${scala_version} ${scala_binary_version}
+  # Build package and unit tests with XGBoost4J
+  mvn --no-transfer-progress clean package
+  xgboost4j_tester_jar="$jvm_packages_dir/xgboost4j-tester/target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar"
+  # Run integration tests with XGBoost4J
+  java -jar $xgboost4j_tester_jar
+
+  # Run integration tests with XGBoost4J-Spark
+  if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
+    # Changing directory so that we do not mix code and resulting files
+    cd target
+    if [[ "$scala_binary_version" == "2.12" ]]; then
+      /opt/spark-scala-2.12/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv
+      /opt/spark-scala-2.12/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv ${PWD}/native_model-${scala_version} ${PWD}/pipeline_model-${scala_version}
+    elif [[ "$scala_binary_version" == "2.13" ]]; then
+      /opt/spark-scala-2.13/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv
+      /opt/spark-scala-2.13/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv ${PWD}/native_model-${scala_version} ${PWD}/pipeline_model-${scala_version}
+    else  # fail loudly: a silent skip would let CI pass without running the Spark integration tests
+      echo "Unexpected scala version: $scala_version ($scala_binary_version)." >&2; exit 1
+    fi
+  fi
+  cd $jvm_packages_dir
+done
+
 set +x
 set +e
--- a/tests/ci_build/tidy.py
+++ b/tests/ci_build/tidy.py
@@ -41,7 +41,7 @@ class ClangTidy(object):
    def __init__(self, args):
        self.cpp_lint = args.cpp
        self.cuda_lint = args.cuda
-        self.use_dmlc_gtest = args.use_dmlc_gtest
+        self.use_dmlc_gtest: bool = args.use_dmlc_gtest
        self.cuda_archs = args.cuda_archs.copy() if args.cuda_archs else []

        if args.tidy_version:
@@ -202,6 +202,7 @@ class ClangTidy(object):
        cdb_file = os.path.join(self.cdb_path, 'compile_commands.json')
        with open(cdb_file, 'r') as fd:
            self.compile_commands = json.load(fd)
+
        tidy_file = os.path.join(self.root_path, '.clang-tidy')
        with open(tidy_file) as fd:
            self.clang_tidy = yaml.safe_load(fd)
@@ -276,16 +277,24 @@ right keywords?
    print('clang-tidy is working.')


-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Run clang-tidy.')
-    parser.add_argument('--cpp', type=int, default=1)
-    parser.add_argument('--tidy-version', type=int, default=None,
-                        help='Specify the version of preferred clang-tidy.')
-    parser.add_argument('--cuda', type=int, default=1)
-    parser.add_argument('--use-dmlc-gtest', type=int, default=1,
-                        help='Whether to use gtest bundled in dmlc-core.')
-    parser.add_argument('--cuda-archs', action='append',
-                        help='List of CUDA archs to build')
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run clang-tidy.")
+    parser.add_argument("--cpp", type=int, default=1)
+    parser.add_argument(
+        "--tidy-version",
+        type=int,
+        default=None,
+        help="Specify the version of preferred clang-tidy.",
+    )
+    parser.add_argument("--cuda", type=int, default=1)
+    parser.add_argument(
+        "--use-dmlc-gtest",
+        action="store_true",
+        help="Whether to use gtest bundled in dmlc-core.",
+    )
+    parser.add_argument(
+        "--cuda-archs", action="append", help="List of CUDA archs to build"
+    )
    args = parser.parse_args()

    test_tidy(args)