diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index a2d8bb69a..79aac0f0b 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -75,3 +75,13 @@ jobs: if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows env: RABIT_MOCK: ON + + + - name: Build and Test XGBoost4J with scala 2.13 + run: | + rm -rfv build/ + cd jvm-packages + mvn -B clean install test -Pdefault,scala-2.13 + if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows + env: + RABIT_MOCK: ON diff --git a/jvm-packages/.gitignore b/jvm-packages/.gitignore index 6d3f7b7cb..e2dc7967a 100644 --- a/jvm-packages/.gitignore +++ b/jvm-packages/.gitignore @@ -1,2 +1,4 @@ tracker.py build.sh +xgboost4j-tester/pom.xml +xgboost4j-tester/iris.csv diff --git a/jvm-packages/README.md b/jvm-packages/README.md index c4c8898dd..239464342 100644 --- a/jvm-packages/README.md +++ b/jvm-packages/README.md @@ -36,6 +36,19 @@ XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5. latest_version_num ``` +or +``` + + ml.dmlc + xgboost4j_2.13 + latest_version_num + + + ml.dmlc + xgboost4j-spark_2.13 + latest_version_num + +``` sbt ```sbt @@ -47,7 +60,6 @@ libraryDependencies ++= Seq( For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases). -To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead. ### Access SNAPSHOT version @@ -85,6 +97,19 @@ Then add XGBoost4J as a dependency: latest_version_num-SNAPSHOT ``` +or with scala 2.13 +``` + + ml.dmlc + xgboost4j_2.13 + latest_version_num-SNAPSHOT + + + ml.dmlc + xgboost4j-spark_2.13 + latest_version_num-SNAPSHOT + +``` sbt ```sbt @@ -96,7 +121,9 @@ libraryDependencies ++= Seq( For the latest release version number, please check [the repository listing](https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html). +### GPU algorithm To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead. +Note that scala 2.13 is not supported by the [NVIDIA/spark-rapids#1525](https://github.com/NVIDIA/spark-rapids/issues/1525) yet, so the GPU algorithm can only be used with scala 2.12. ## Examples diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index c80bf8bc6..fa0cb5344 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -5,7 +5,7 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT pom XGBoost JVM Package @@ -34,6 +34,7 @@ 1.8 1.8 1.17.1 + 4.13.2 3.4.0 3.3.2 2.12.17 @@ -45,7 +46,9 @@ 23.04.0 23.04.1 cuda11 - + 3.2.16 + 2.9.0 + central_maven @@ -71,6 +74,14 @@ + + scala-2.13 + + 2.13 + 2.13.10 + + + gpu @@ -467,6 +478,7 @@ + com.esotericsoftware kryo @@ -483,6 +495,11 @@ scala-library ${scala.version} + + org.scala-lang.modules + scala-collection-compat_${scala.binary.version} + ${scala-collection-compat.version} + commons-logging commons-logging @@ -491,13 +508,13 @@ org.scalatest scalatest_${scala.binary.version} - 3.2.16 + ${scalatest.version} test org.scalactic scalactic_${scala.binary.version} - 3.2.15 + ${scalatest.version} test diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml index 40c9c72a4..e6ed8a600 100644 --- a/jvm-packages/xgboost4j-example/pom.xml +++ b/jvm-packages/xgboost4j-example/pom.xml @@ -5,10 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT - xgboost4j-example_2.12 + xgboost4j-example + xgboost4j-example_${scala.binary.version} 2.0.0-SNAPSHOT jar diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala index cb859f62d..3bfefb841 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala @@ -73,12 +73,13 @@ object DistTrainWithFlink { .map(_.f1.f0) .returns(testDataTypeHint) - val paramMap = mapAsJavaMap(Map( - ("eta", "0.1".asInstanceOf[AnyRef]), - ("max_depth", "2"), - ("objective", "binary:logistic"), - ("verbosity", "1") - )) + val paramMap = Map( + ("eta", "0.1".asInstanceOf[AnyRef]), + ("max_depth", "2"), + ("objective", "binary:logistic"), + ("verbosity", "1") + ) + .asJava // number of iterations val round = 2 diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala index 6d676b0ae..b8da31c09 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala @@ -20,10 +20,9 @@ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature._ import org.apache.spark.ml.tuning._ -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.types._ - -import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel} +import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier} // this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris) @@ -50,6 +49,13 @@ object SparkMLlibPipeline { .appName("XGBoost4J-Spark Pipeline Example") .getOrCreate() + run(spark, inputPath, nativeModelPath, pipelineModelPath, treeMethod, numWorkers) + .show(false) + } + private[spark] def run(spark: SparkSession, inputPath: String, nativeModelPath: String, + pipelineModelPath: String, treeMethod: String, + numWorkers: Int): DataFrame = { + // Load dataset val schema = new StructType(Array( StructField("sepal length", DoubleType, true), @@ -90,11 +96,11 @@ object SparkMLlibPipeline { val labelConverter = new IndexToString() .setInputCol("prediction") .setOutputCol("realLabel") - .setLabels(labelIndexer.labels) + .setLabels(labelIndexer.labelsArray(0)) val pipeline = new Pipeline() .setStages(Array(assembler, labelIndexer, booster, labelConverter)) - val model = pipeline.fit(training) + val model: PipelineModel = pipeline.fit(training) // Batch prediction val prediction = model.transform(test) @@ -136,6 +142,6 @@ object SparkMLlibPipeline { // Load a saved model and serving val model2 = PipelineModel.load(pipelineModelPath) - model2.transform(test).show(false) + model2.transform(test) } } diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala index 17a32bc09..a7886f524 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala @@ -17,9 +17,8 @@ package ml.dmlc.xgboost4j.scala.example.spark import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier - import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} // this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris) @@ -38,6 +37,12 @@ object SparkTraining { val spark = SparkSession.builder().getOrCreate() val inputPath = args(0) + val results: DataFrame = run(spark, inputPath, treeMethod, numWorkers) + results.show() + } + +private[spark] def run(spark: SparkSession, inputPath: String, + treeMethod: String, numWorkers: Int): DataFrame = { val schema = new StructType(Array( StructField("sepal length", DoubleType, true), StructField("sepal width", DoubleType, true), @@ -81,7 +86,6 @@ object SparkTraining { setFeaturesCol("features"). setLabelCol("classIndex") val xgbClassificationModel = xgbClassifier.fit(train) - val results = xgbClassificationModel.transform(test) - results.show() + xgbClassificationModel.transform(test) } } diff --git a/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala new file mode 100644 index 000000000..f6cb700df --- /dev/null +++ b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala @@ -0,0 +1,123 @@ +/* + Copyright (c) 2014-2023 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package ml.dmlc.xgboost4j.scala.example.spark + +import org.apache.spark.sql.SparkSession +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite +import org.slf4j.LoggerFactory + +import java.io.File +import java.nio.file.{Files, StandardOpenOption} +import scala.jdk.CollectionConverters._ +import scala.util.{Random, Try} + +class SparkExamplesTest extends AnyFunSuite with BeforeAndAfterAll { + private val logger = LoggerFactory.getLogger(classOf[SparkExamplesTest]) + private val random = new Random(42) + protected val numWorkers: Int = scala.math.min(Runtime.getRuntime.availableProcessors(), 4) + + private val pathToTestDataset = Files.createTempFile("", "iris.csv").toAbsolutePath + private var spark: SparkSession = _ + + override def beforeAll(): Unit = { + + def generateLine(i: Int): String = { + val getIrisName = (int: Int) => { + int % 3 match { + case 0 => "Iris-versicolor" + case 1 => "Iris-virginica" + case 2 => "Iris-setosa" + } + } + val generateValue = () => Math.abs(random.nextInt(99) * 0.1) + val sepalLength = generateValue() + val sepalWidth = generateValue() + val petalLength = generateValue() + val petalWidth = generateValue() + val irisName = getIrisName(Math.abs(random.nextInt()) + i) + s"$sepalLength,$sepalWidth,$petalLength,$petalWidth,$irisName" + } + + if (spark == null) { + spark = SparkSession + .builder() + .appName("XGBoost4J-Spark Pipeline Example") + .master(s"local[${numWorkers}]") + .config("spark.ui.enabled", value = false) + .config("spark.driver.memory", "512m") + .config("spark.barrier.sync.timeout", 10) + .config("spark.task.cpus", 1) + .getOrCreate() + spark.sparkContext.setLogLevel("ERROR") + } + val data = (0 until 150) + .map(i => generateLine(i)) + .toList + .asJava + Files.write(pathToTestDataset, + data, + StandardOpenOption.CREATE, + StandardOpenOption.WRITE, + StandardOpenOption.TRUNCATE_EXISTING) + logger.info(s"${new String(Files.readAllBytes(pathToTestDataset))}") + + } + + override def afterAll(): Unit = { + if (spark != null) { + spark.stop() + cleanExternalCache(spark.sparkContext.appName) + spark = null + } + + Try(Files.deleteIfExists(pathToTestDataset)) + .recover { + case e => + logger.warn( + s"Could not delete temporary file $pathToTestDataset. Please, remove it manually", + e + ) + true + } + } + + private def cleanExternalCache(prefix: String): Unit = { + val dir = new File(".") + for (file <- dir.listFiles() if file.getName.startsWith(prefix)) { + file.delete() + } + } + + test("Smoke test for SparkMLlibPipeline example") { + SparkMLlibPipeline.run(spark, pathToTestDataset.toString, "target/native-model", + "target/pipeline-model", "auto", 2) + } + + test("Smoke test for SparkTraining example") { + val spark = SparkSession + .builder() + .appName("XGBoost4J-Spark Pipeline Example") + .master(s"local[${numWorkers}]") + .config("spark.ui.enabled", value = false) + .config("spark.driver.memory", "512m") + .config("spark.barrier.sync.timeout", 10) + .config("spark.task.cpus", 1) + .getOrCreate() + + SparkTraining.run(spark, pathToTestDataset.toString, "auto", 2) + } +} diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml index a9a80e29a..8d51a9dcf 100644 --- a/jvm-packages/xgboost4j-flink/pom.xml +++ b/jvm-packages/xgboost4j-flink/pom.xml @@ -5,9 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT + + xgboost4j-flink xgboost4j-flink_${scala.binary.version} 2.0.0-SNAPSHOT diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml index 1d7a06708..f34680302 100644 --- a/jvm-packages/xgboost4j-gpu/pom.xml +++ b/jvm-packages/xgboost4j-gpu/pom.xml @@ -5,10 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT - xgboost4j-gpu_2.12 + xgboost4j-gpu_${scala.binary.version} + xgboost4j-gpu 2.0.0-SNAPSHOT jar @@ -35,13 +36,13 @@ junit junit - 4.13.2 + ${junit.version} test org.scalatest scalatest_${scala.binary.version} - 3.2.15 + ${scalatest.version} provided diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml index 57770be5a..b7be69e69 100644 --- a/jvm-packages/xgboost4j-spark-gpu/pom.xml +++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml @@ -5,10 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT - xgboost4j-spark-gpu_2.12 + xgboost4j-spark-gpu + xgboost4j-spark-gpu_${scala.binary.version} @@ -24,7 +25,7 @@ ml.dmlc xgboost4j-gpu_${scala.binary.version} - 2.0.0-SNAPSHOT + ${project.version} org.apache.spark diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml index 3a84233d1..d8f4cb914 100644 --- a/jvm-packages/xgboost4j-spark/pom.xml +++ b/jvm-packages/xgboost4j-spark/pom.xml @@ -5,10 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT - xgboost4j-spark_2.12 + xgboost4j-spark + xgboost4j-spark_${scala.binary.version} @@ -24,7 +25,7 @@ ml.dmlc xgboost4j_${scala.binary.version} - 2.0.0-SNAPSHOT + ${project.version} org.apache.spark diff --git a/jvm-packages/xgboost4j-tester/generate_pom.py b/jvm-packages/xgboost4j-tester/generate_pom.py index cc391dd00..b9c274c28 100644 --- a/jvm-packages/xgboost4j-tester/generate_pom.py +++ b/jvm-packages/xgboost4j-tester/generate_pom.py @@ -8,25 +8,28 @@ pom_template = """ 4.0.0 ml.dmlc - xgboost4j-tester_2.12 + xgboost4j-tester_{scala_binary_version} 1.0-SNAPSHOT - xgboost4j-tester_2.12 + xgboost4j-tester UTF-8 {maven_compiler_source} {maven_compiler_target} + 4.13.2 {spark_version} {scala_version} + 3.2.15 {scala_binary_version} + 5.5.0 - + com.esotericsoftware kryo - 4.0.2 + ${{kryo.version}} org.scala-lang @@ -48,29 +51,12 @@ pom_template = """ commons-logging 1.2 - - com.typesafe.akka - akka-testkit_${{scala.binary.version}} - 2.6.20 - test - org.scalatest scalatest_${{scala.binary.version}} - 3.0.8 + ${{scalatest.version}} test - - org.scalactic - scalactic_${{scala.binary.version}} - 3.2.15 - test - - - org.apache.commons - commons-lang3 - 3.9 - org.apache.spark spark-core_${{scala.binary.version}} @@ -92,7 +78,7 @@ pom_template = """ junit junit - 4.13.2 + ${{junit.version}} test @@ -122,36 +108,9 @@ pom_template = """ - - - maven-clean-plugin - 3.1.0 - - - - maven-resources-plugin - 3.0.2 - - - maven-compiler-plugin - 3.8.0 - - - maven-jar-plugin - 3.0.2 - - - maven-install-plugin - 2.5.2 - - - maven-deploy-plugin - 2.8.2 - org.apache.maven.plugins maven-assembly-plugin - 2.4 jar-with-dependencies @@ -171,22 +130,12 @@ pom_template = """ - - - maven-site-plugin - 3.7.1 - - - maven-project-info-reports-plugin - 3.0.0 - org.apache.maven.plugins maven-surefire-plugin - 2.22.1 - ml.dmlc:xgboost4j_2.12 + ml.dmlc:xgboost4j_${{scala.binary.version}} diff --git a/jvm-packages/xgboost4j-tester/src/test/java/ml/dmlc/xgboost4j/tester/AppTest.java b/jvm-packages/xgboost4j-tester/src/test/java/ml/dmlc/xgboost4j/tester/AppTest.java deleted file mode 100644 index 2df693748..000000000 --- a/jvm-packages/xgboost4j-tester/src/test/java/ml/dmlc/xgboost4j/tester/AppTest.java +++ /dev/null @@ -1,20 +0,0 @@ -package ml.dmlc.xgboost4j.tester; - -import static org.junit.Assert.assertTrue; - -import org.junit.Test; - -/** - * Unit test for simple App. - */ -public class AppTest -{ - /** - * Rigorous Test :-) - */ - @Test - public void shouldAnswerWithTrue() - { - assertTrue( true ); - } -} diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index 17ed66a12..4352aab12 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -5,10 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT - xgboost4j_2.12 + xgboost4j + xgboost4j_${scala.binary.version} 2.0.0-SNAPSHOT jar @@ -28,13 +29,13 @@ junit junit - 4.13.2 + ${junit.version} test org.scalatest scalatest_${scala.binary.version} - 3.2.16 + ${scalatest.version} provided diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/EvalTrait.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/EvalTrait.scala index 587ace352..fe17804fd 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/EvalTrait.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/EvalTrait.scala @@ -37,7 +37,7 @@ trait EvalTrait extends IEvaluation { */ def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float - private[scala] def eval(predicts: Array[Array[Float]], jdmat: java.DMatrix): Float = { + def eval(predicts: Array[Array[Float]], jdmat: java.DMatrix): Float = { require(predicts.length == jdmat.getLabel.length, "predicts size and label size must match " + s" predicts size: ${predicts.length}, label size: ${jdmat.getLabel.length}") eval(predicts, new DMatrix(jdmat)) diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/ObjectiveTrait.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/ObjectiveTrait.scala index 24e603762..de218f0c5 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/ObjectiveTrait.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/ObjectiveTrait.scala @@ -31,7 +31,7 @@ trait ObjectiveTrait extends IObjective { */ def getGradient(predicts: Array[Array[Float]], dtrain: DMatrix): List[Array[Float]] - private[scala] def getGradient(predicts: Array[Array[Float]], dtrain: JDMatrix): + def getGradient(predicts: Array[Array[Float]], dtrain: JDMatrix): java.util.List[Array[Float]] = { getGradient(predicts, new DMatrix(dtrain)).asJava } diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala index 90d06c343..50d86c893 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala @@ -17,12 +17,11 @@ package ml.dmlc.xgboost4j.scala import java.io.InputStream +import ml.dmlc.xgboost4j.java.{XGBoostError, XGBoost => JXGBoost} -import ml.dmlc.xgboost4j.java.{XGBoostError, Booster => JBooster, XGBoost => JXGBoost} -import scala.collection.JavaConverters._ - +import scala.jdk.CollectionConverters._ import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.fs.Path /** * XGBoost Scala Training function. @@ -40,7 +39,12 @@ object XGBoost { earlyStoppingRound: Int = 0, prevBooster: Booster, checkpointParams: Option[ExternalCheckpointParams]): Booster = { - val jWatches = watches.mapValues(_.jDMatrix).asJava + + // we have to filter null value for customized obj and eval + val jParams: java.util.Map[String, AnyRef] = + params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).toMap.asJava + + val jWatches = watches.mapValues(_.jDMatrix).toMap.asJava val jBooster = if (prevBooster == null) { null } else { @@ -51,8 +55,7 @@ object XGBoost { map(cp => { JXGBoost.trainAndSaveCheckpoint( dtrain.jDMatrix, - // we have to filter null value for customized obj and eval - params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).asJava, + jParams, numRounds, jWatches, metrics, obj, eval, earlyStoppingRound, jBooster, cp.checkpointInterval, cp.checkpointPath, @@ -61,8 +64,7 @@ object XGBoost { getOrElse( JXGBoost.train( dtrain.jDMatrix, - // we have to filter null value for customized obj and eval - params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).asJava, + jParams, numRounds, jWatches, metrics, obj, eval, earlyStoppingRound, jBooster) ) if (prevBooster == null) { diff --git a/tests/buildkite/build-jvm-packages.sh b/tests/buildkite/build-jvm-packages.sh index 1de43bbd0..33cfffe71 100755 --- a/tests/buildkite/build-jvm-packages.sh +++ b/tests/buildkite/build-jvm-packages.sh @@ -4,11 +4,18 @@ set -euo pipefail source tests/buildkite/conftest.sh -echo "--- Build XGBoost JVM packages" +echo "--- Build XGBoost JVM packages scala 2.12" tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \ ${SPARK_VERSION} + +echo "--- Build XGBoost JVM packages scala 2.13" + +tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \ + ${SPARK_VERSION} "" "" "true" + echo "--- Stash XGBoost4J JARs" buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar" buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar" +buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar" buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar" diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh index cf9270c11..957dd443c 100755 --- a/tests/buildkite/conftest.sh +++ b/tests/buildkite/conftest.sh @@ -25,7 +25,7 @@ set -x CUDA_VERSION=11.8.0 NCCL_VERSION=2.16.5-1 RAPIDS_VERSION=23.02 -SPARK_VERSION=3.1.1 +SPARK_VERSION=3.4.0 JDK_VERSION=8 if [[ -z ${BUILDKITE:-} ]] diff --git a/tests/ci_build/Dockerfile.jvm_cross b/tests/ci_build/Dockerfile.jvm_cross index 6d9c5c57f..fdfae310a 100644 --- a/tests/ci_build/Dockerfile.jvm_cross +++ b/tests/ci_build/Dockerfile.jvm_cross @@ -20,10 +20,14 @@ RUN \ wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \ tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \ ln -s /opt/apache-maven-3.6.1/ /opt/maven && \ - # Spark - wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \ - tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \ - ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark + # Spark with scala 2.12 + mkdir -p /opt/spark-scala-2.12 && \ + wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz && \ + tar xvf spark-$SPARK_VERSION-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \ + # Spark with scala 2.13 + mkdir -p /opt/spark-scala-2.13 && \ + wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz && \ + tar xvf spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13 ENV PATH=/opt/mambaforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH diff --git a/tests/ci_build/build_jvm_packages.sh b/tests/ci_build/build_jvm_packages.sh index 241fc445f..5797a1f61 100755 --- a/tests/ci_build/build_jvm_packages.sh +++ b/tests/ci_build/build_jvm_packages.sh @@ -6,6 +6,7 @@ set -x spark_version=$1 use_cuda=$2 gpu_arch=$3 +use_scala213=$4 gpu_options="" if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then @@ -22,7 +23,13 @@ export RABIT_MOCK=ON if [ "x$gpu_arch" != "x" ]; then export GPU_ARCH_FLAG=$gpu_arch fi -mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options + +mvn_profile_string="" +if [ "x$use_scala213" != "x" ]; then + export mvn_profile_string="-Pdefault,scala-2.13" +fi + +mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options set +x set +e diff --git a/tests/ci_build/test_jvm_cross.sh b/tests/ci_build/test_jvm_cross.sh index 378846d65..18265cf01 100755 --- a/tests/ci_build/test_jvm_cross.sh +++ b/tests/ci_build/test_jvm_cross.sh @@ -6,37 +6,56 @@ set -x # Initialize local Maven repository ./tests/ci_build/initialize_maven.sh -# Get version number of XGBoost4J and other auxiliary information cd jvm-packages +jvm_packages_dir=`pwd` +# Get version number of XGBoost4J and other auxiliary information xgboost4j_version=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout) maven_compiler_source=$(mvn help:evaluate -Dexpression=maven.compiler.source -q -DforceStdout) maven_compiler_target=$(mvn help:evaluate -Dexpression=maven.compiler.target -q -DforceStdout) spark_version=$(mvn help:evaluate -Dexpression=spark.version -q -DforceStdout) -scala_version=$(mvn help:evaluate -Dexpression=scala.version -q -DforceStdout) -scala_binary_version=$(mvn help:evaluate -Dexpression=scala.binary.version -q -DforceStdout) -# Install XGBoost4J JAR into local Maven repository -mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar -mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}-tests.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=test-jar -Dclassifier=tests -mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-spark_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar -mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-example_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar - -cd xgboost4j-tester -# Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests -python3 ./generate_pom.py ${xgboost4j_version} ${maven_compiler_source} ${maven_compiler_target} ${spark_version} ${scala_version} ${scala_binary_version} -# Run unit tests with XGBoost4J -mvn --no-transfer-progress package - -# Run integration tests with XGBoost4J -java -jar ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar - -# Run integration tests with XGBoost4J-Spark -if [ ! -z "$RUN_INTEGRATION_TEST" ] -then +if [ ! -z "$RUN_INTEGRATION_TEST" ]; then + cd $jvm_packages_dir/xgboost4j-tester python3 get_iris.py - spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv - spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv ${PWD}/native_model ${PWD}/pipeline_model + cd $jvm_packages_dir fi +# including maven profiles for different scala versions: 2.12 is the default at the moment. +for _maven_profile_string in "" "-Pdefault,scala-2.13"; do + scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout) + scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout) + + # Install XGBoost4J JAR into local Maven repository + mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar + mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}-tests.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=test-jar -Dclassifier=tests + mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-spark_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar + mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-example_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar + + cd xgboost4j-tester + # Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests + python3 ./generate_pom.py ${xgboost4j_version} ${maven_compiler_source} ${maven_compiler_target} ${spark_version} ${scala_version} ${scala_binary_version} + # Build package and unit tests with XGBoost4J + mvn --no-transfer-progress clean package + xgboost4j_tester_jar="$jvm_packages_dir/xgboost4j-tester/target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar" + # Run integration tests with XGBoost4J + java -jar $xgboost4j_tester_jar + + # Run integration tests with XGBoost4J-Spark + if [ ! -z "$RUN_INTEGRATION_TEST" ]; then + # Changing directory so that we do not mix code and resulting files + cd target + if [[ "$scala_binary_version" == "2.12" ]]; then + /opt/spark-scala-2.12/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv + /opt/spark-scala-2.12/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv ${PWD}/native_model-${scala_version} ${PWD}/pipeline_model-${scala_version} + elif [[ "$scala_binary_version" == "2.13" ]]; then + /opt/spark-scala-2.13/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv + /opt/spark-scala-2.13/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv ${PWD}/native_model-${scala_version} ${PWD}/pipeline_model-${scala_version} + else + echo "Unexpected scala version: $scala_version ($scala_binary_version)." + fi + fi + cd $jvm_packages_dir +done + set +x set +e