Scala 2.13 support. (#9099)

1. Updated the test logic
2. Added smoke tests for Spark examples.
3. Added integration tests for Spark with Scala 2.13
This commit is contained in:
Boris 2023-05-27 13:34:02 +02:00 committed by GitHub
parent 8c174ef2d3
commit a01df102c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 325 additions and 160 deletions

View File

@ -75,3 +75,13 @@ jobs:
if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows
env: env:
RABIT_MOCK: ON RABIT_MOCK: ON
- name: Build and Test XGBoost4J with scala 2.13
run: |
rm -rfv build/
cd jvm-packages
mvn -B clean install test -Pdefault,scala-2.13
if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows
env:
RABIT_MOCK: ON

View File

@ -1,2 +1,4 @@
tracker.py tracker.py
build.sh build.sh
xgboost4j-tester/pom.xml
xgboost4j-tester/iris.csv

View File

@ -36,6 +36,19 @@ XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5.
<version>latest_version_num</version> <version>latest_version_num</version>
</dependency> </dependency>
``` ```
or, for Scala 2.13:
```
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_2.13</artifactId>
<version>latest_version_num</version>
</dependency>
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_2.13</artifactId>
<version>latest_version_num</version>
</dependency>
```
<b>sbt</b> <b>sbt</b>
```sbt ```sbt
@ -47,7 +60,6 @@ libraryDependencies ++= Seq(
For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases). For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases).
To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead.
### Access SNAPSHOT version ### Access SNAPSHOT version
@ -85,6 +97,19 @@ Then add XGBoost4J as a dependency:
<version>latest_version_num-SNAPSHOT</version> <version>latest_version_num-SNAPSHOT</version>
</dependency> </dependency>
``` ```
or with scala 2.13
```
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_2.13</artifactId>
<version>latest_version_num-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_2.13</artifactId>
<version>latest_version_num-SNAPSHOT</version>
</dependency>
```
<b>sbt</b> <b>sbt</b>
```sbt ```sbt
@ -96,7 +121,9 @@ libraryDependencies ++= Seq(
For the latest release version number, please check [the repository listing](https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html). For the latest release version number, please check [the repository listing](https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html).
### GPU algorithm
To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead. To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead.
Note that Scala 2.13 is not yet supported by spark-rapids (see [NVIDIA/spark-rapids#1525](https://github.com/NVIDIA/spark-rapids/issues/1525)), so the GPU algorithm can only be used with Scala 2.12.
## Examples ## Examples

View File

@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
<packaging>pom</packaging> <packaging>pom</packaging>
<name>XGBoost JVM Package</name> <name>XGBoost JVM Package</name>
@ -34,6 +34,7 @@
<maven.compiler.source>1.8</maven.compiler.source> <maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target> <maven.compiler.target>1.8</maven.compiler.target>
<flink.version>1.17.1</flink.version> <flink.version>1.17.1</flink.version>
<junit.version>4.13.2</junit.version>
<spark.version>3.4.0</spark.version> <spark.version>3.4.0</spark.version>
<spark.version.gpu>3.3.2</spark.version.gpu> <spark.version.gpu>3.3.2</spark.version.gpu>
<scala.version>2.12.17</scala.version> <scala.version>2.12.17</scala.version>
@ -45,7 +46,9 @@
<cudf.version>23.04.0</cudf.version> <cudf.version>23.04.0</cudf.version>
<spark.rapids.version>23.04.1</spark.rapids.version> <spark.rapids.version>23.04.1</spark.rapids.version>
<cudf.classifier>cuda11</cudf.classifier> <cudf.classifier>cuda11</cudf.classifier>
</properties> <scalatest.version>3.2.16</scalatest.version>
<scala-collection-compat.version>2.9.0</scala-collection-compat.version>
</properties>
<repositories> <repositories>
<repository> <repository>
<id>central_maven</id> <id>central_maven</id>
@ -71,6 +74,14 @@
</modules> </modules>
</profile> </profile>
<profile>
<id>scala-2.13</id>
<properties>
<scala.binary.version>2.13</scala.binary.version>
<scala.version>2.13.10</scala.version>
</properties>
</profile>
<!-- gpu profile with both cpu and gpu test suites --> <!-- gpu profile with both cpu and gpu test suites -->
<profile> <profile>
<id>gpu</id> <id>gpu</id>
@ -467,6 +478,7 @@
</plugins> </plugins>
</reporting> </reporting>
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>com.esotericsoftware</groupId> <groupId>com.esotericsoftware</groupId>
<artifactId>kryo</artifactId> <artifactId>kryo</artifactId>
@ -483,6 +495,11 @@
<artifactId>scala-library</artifactId> <artifactId>scala-library</artifactId>
<version>${scala.version}</version> <version>${scala.version}</version>
</dependency> </dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
<version>${scala-collection-compat.version}</version>
</dependency>
<dependency> <dependency>
<groupId>commons-logging</groupId> <groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId> <artifactId>commons-logging</artifactId>
@ -491,13 +508,13 @@
<dependency> <dependency>
<groupId>org.scalatest</groupId> <groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.binary.version}</artifactId> <artifactId>scalatest_${scala.binary.version}</artifactId>
<version>3.2.16</version> <version>${scalatest.version}</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.scalactic</groupId> <groupId>org.scalactic</groupId>
<artifactId>scalactic_${scala.binary.version}</artifactId> <artifactId>scalactic_${scala.binary.version}</artifactId>
<version>3.2.15</version> <version>${scalatest.version}</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
</dependencies> </dependencies>

View File

@ -5,10 +5,11 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
</parent> </parent>
<artifactId>xgboost4j-example_2.12</artifactId> <name>xgboost4j-example</name>
<artifactId>xgboost4j-example_${scala.binary.version}</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<build> <build>

View File

@ -73,12 +73,13 @@ object DistTrainWithFlink {
.map(_.f1.f0) .map(_.f1.f0)
.returns(testDataTypeHint) .returns(testDataTypeHint)
val paramMap = mapAsJavaMap(Map( val paramMap = Map(
("eta", "0.1".asInstanceOf[AnyRef]), ("eta", "0.1".asInstanceOf[AnyRef]),
("max_depth", "2"), ("max_depth", "2"),
("objective", "binary:logistic"), ("objective", "binary:logistic"),
("verbosity", "1") ("verbosity", "1")
)) )
.asJava
// number of iterations // number of iterations
val round = 2 val round = 2

View File

@ -20,10 +20,9 @@ import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature._ import org.apache.spark.ml.feature._
import org.apache.spark.ml.tuning._ import org.apache.spark.ml.tuning._
import org.apache.spark.sql.SparkSession import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.types._ import org.apache.spark.sql.types._
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel}
// this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris) // this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)
@ -50,6 +49,13 @@ object SparkMLlibPipeline {
.appName("XGBoost4J-Spark Pipeline Example") .appName("XGBoost4J-Spark Pipeline Example")
.getOrCreate() .getOrCreate()
run(spark, inputPath, nativeModelPath, pipelineModelPath, treeMethod, numWorkers)
.show(false)
}
private[spark] def run(spark: SparkSession, inputPath: String, nativeModelPath: String,
pipelineModelPath: String, treeMethod: String,
numWorkers: Int): DataFrame = {
// Load dataset // Load dataset
val schema = new StructType(Array( val schema = new StructType(Array(
StructField("sepal length", DoubleType, true), StructField("sepal length", DoubleType, true),
@ -90,11 +96,11 @@ object SparkMLlibPipeline {
val labelConverter = new IndexToString() val labelConverter = new IndexToString()
.setInputCol("prediction") .setInputCol("prediction")
.setOutputCol("realLabel") .setOutputCol("realLabel")
.setLabels(labelIndexer.labels) .setLabels(labelIndexer.labelsArray(0))
val pipeline = new Pipeline() val pipeline = new Pipeline()
.setStages(Array(assembler, labelIndexer, booster, labelConverter)) .setStages(Array(assembler, labelIndexer, booster, labelConverter))
val model = pipeline.fit(training) val model: PipelineModel = pipeline.fit(training)
// Batch prediction // Batch prediction
val prediction = model.transform(test) val prediction = model.transform(test)
@ -136,6 +142,6 @@ object SparkMLlibPipeline {
// Load a saved model and serving // Load a saved model and serving
val model2 = PipelineModel.load(pipelineModelPath) val model2 = PipelineModel.load(pipelineModelPath)
model2.transform(test).show(false) model2.transform(test)
} }
} }

View File

@ -17,9 +17,8 @@
package ml.dmlc.xgboost4j.scala.example.spark package ml.dmlc.xgboost4j.scala.example.spark
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.SparkSession import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
// this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris) // this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)
@ -38,6 +37,12 @@ object SparkTraining {
val spark = SparkSession.builder().getOrCreate() val spark = SparkSession.builder().getOrCreate()
val inputPath = args(0) val inputPath = args(0)
val results: DataFrame = run(spark, inputPath, treeMethod, numWorkers)
results.show()
}
private[spark] def run(spark: SparkSession, inputPath: String,
treeMethod: String, numWorkers: Int): DataFrame = {
val schema = new StructType(Array( val schema = new StructType(Array(
StructField("sepal length", DoubleType, true), StructField("sepal length", DoubleType, true),
StructField("sepal width", DoubleType, true), StructField("sepal width", DoubleType, true),
@ -81,7 +86,6 @@ object SparkTraining {
setFeaturesCol("features"). setFeaturesCol("features").
setLabelCol("classIndex") setLabelCol("classIndex")
val xgbClassificationModel = xgbClassifier.fit(train) val xgbClassificationModel = xgbClassifier.fit(train)
val results = xgbClassificationModel.transform(test) xgbClassificationModel.transform(test)
results.show()
} }
} }

View File

@ -0,0 +1,123 @@
/*
Copyright (c) 2014-2023 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ml.dmlc.xgboost4j.scala.example.spark
import org.apache.spark.sql.SparkSession
import org.scalatest.BeforeAndAfterAll
import org.scalatest.funsuite.AnyFunSuite
import org.slf4j.LoggerFactory
import java.io.File
import java.nio.file.{Files, StandardOpenOption}
import scala.jdk.CollectionConverters._
import scala.util.{Random, Try}
/**
 * Smoke tests for the XGBoost4J-Spark examples.
 *
 * Before the tests run, a synthetic Iris-like CSV file (150 rows: four numeric measurements
 * followed by a class name) is written to a temporary file. Each example's `run` method is then
 * invoked against that file; a test passes when `run` completes without throwing.
 */
class SparkExamplesTest extends AnyFunSuite with BeforeAndAfterAll {
  private val logger = LoggerFactory.getLogger(classOf[SparkExamplesTest])
  // Fixed seed, so the generated dataset is identical on every run.
  private val random = new Random(42)
  protected val numWorkers: Int = scala.math.min(Runtime.getRuntime.availableProcessors(), 4)
  // The examples are run with at most 2 workers, but never more than the `local[numWorkers]`
  // master provides task slots for.
  // NOTE(review): presumably all XGBoost workers must be scheduled at the same time, so asking
  // for 2 workers on a single-core machine (local[1]) could not be satisfied - confirm.
  private val trainingWorkers: Int = scala.math.min(numWorkers, 2)
  // Class names indexed by `value mod 3`.
  private val irisNames = Array("Iris-versicolor", "Iris-virginica", "Iris-setosa")
  private val pathToTestDataset = Files.createTempFile("", "iris.csv").toAbsolutePath
  private var spark: SparkSession = _

  /**
   * Returns the local Spark session used by the tests, creating it if none exists yet
   * (`getOrCreate` semantics).
   */
  private def localSparkSession(): SparkSession = {
    SparkSession
      .builder()
      .appName("XGBoost4J-Spark Pipeline Example")
      .master(s"local[${numWorkers}]")
      .config("spark.ui.enabled", value = false)
      .config("spark.driver.memory", "512m")
      .config("spark.barrier.sync.timeout", 10)
      .config("spark.task.cpus", 1)
      .getOrCreate()
  }

  /**
   * Generates one CSV line: four pseudo-random measurements in [0.0, 9.8] and a class name.
   *
   * @param i row index, mixed into the class selection
   */
  private def generateLine(i: Int): String = {
    def nextMeasurement(): Double = random.nextInt(99) * 0.1

    val sepalLength = nextMeasurement()
    val sepalWidth = nextMeasurement()
    val petalLength = nextMeasurement()
    val petalWidth = nextMeasurement()
    // floorMod keeps the index in [0, 3) even when `abs(nextInt()) + i` overflows or when
    // nextInt() returns Int.MinValue (whose abs is still negative); a plain `%` would yield a
    // negative remainder in those cases.
    val irisName = irisNames(Math.floorMod(Math.abs(random.nextInt()) + i, 3))
    s"$sepalLength,$sepalWidth,$petalLength,$petalWidth,$irisName"
  }

  override def beforeAll(): Unit = {
    if (spark == null) {
      spark = localSparkSession()
      spark.sparkContext.setLogLevel("ERROR")
    }
    val data = (0 until 150)
      .map(i => generateLine(i))
      .toList
      .asJava
    Files.write(pathToTestDataset,
      data,
      StandardOpenOption.CREATE,
      StandardOpenOption.WRITE,
      StandardOpenOption.TRUNCATE_EXISTING)
    logger.info(s"${new String(Files.readAllBytes(pathToTestDataset))}")
  }

  override def afterAll(): Unit = {
    if (spark != null) {
      // Read the application name before stopping the session.
      val appName = spark.sparkContext.appName
      spark.stop()
      cleanExternalCache(appName)
      spark = null
    }
    // Best effort: a failure to delete the temporary dataset is logged, not propagated.
    Try(Files.deleteIfExists(pathToTestDataset)).failed.foreach { e =>
      logger.warn(
        s"Could not delete temporary file $pathToTestDataset. Please, remove it manually",
        e
      )
    }
  }

  /**
   * Deletes every entry of the current working directory whose name starts with `prefix`.
   *
   * @param prefix file name prefix to match
   */
  private def cleanExternalCache(prefix: String): Unit = {
    val dir = new File(".")
    // listFiles() returns null when the directory cannot be listed.
    val entries = Option(dir.listFiles()).getOrElse(Array.empty[File])
    for (file <- entries if file.getName.startsWith(prefix)) {
      file.delete()
    }
  }

  test("Smoke test for SparkMLlibPipeline example") {
    SparkMLlibPipeline.run(spark, pathToTestDataset.toString, "target/native-model",
      "target/pipeline-model", "auto", trainingWorkers)
  }

  test("Smoke test for SparkTraining example") {
    // getOrCreate semantics: reuses the session created in beforeAll when it is still active.
    val session = localSparkSession()
    SparkTraining.run(session, pathToTestDataset.toString, "auto", trainingWorkers)
  }
}

View File

@ -5,9 +5,11 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
</parent> </parent>
<name>xgboost4j-flink</name>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId> <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
<properties> <properties>

View File

@ -5,10 +5,11 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
</parent> </parent>
<artifactId>xgboost4j-gpu_2.12</artifactId> <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<name>xgboost4j-gpu</name>
<version>2.0.0-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
<packaging>jar</packaging> <packaging>jar</packaging>
@ -35,13 +36,13 @@
<dependency> <dependency>
<groupId>junit</groupId> <groupId>junit</groupId>
<artifactId>junit</artifactId> <artifactId>junit</artifactId>
<version>4.13.2</version> <version>${junit.version}</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.scalatest</groupId> <groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.binary.version}</artifactId> <artifactId>scalatest_${scala.binary.version}</artifactId>
<version>3.2.15</version> <version>${scalatest.version}</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -5,10 +5,11 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
</parent> </parent>
<artifactId>xgboost4j-spark-gpu_2.12</artifactId> <name>xgboost4j-spark-gpu</name>
<artifactId>xgboost4j-spark-gpu_${scala.binary.version}</artifactId>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
@ -24,7 +25,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId> <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>

View File

@ -5,10 +5,11 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
</parent> </parent>
<artifactId>xgboost4j-spark_2.12</artifactId> <name>xgboost4j-spark</name>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
@ -24,7 +25,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>

View File

@ -8,25 +8,28 @@ pom_template = """
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-tester_2.12</artifactId> <artifactId>xgboost4j-tester_{scala_binary_version}</artifactId>
<version>1.0-SNAPSHOT</version> <version>1.0-SNAPSHOT</version>
<name>xgboost4j-tester_2.12</name> <name>xgboost4j-tester</name>
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>{maven_compiler_source}</maven.compiler.source> <maven.compiler.source>{maven_compiler_source}</maven.compiler.source>
<maven.compiler.target>{maven_compiler_target}</maven.compiler.target> <maven.compiler.target>{maven_compiler_target}</maven.compiler.target>
<junit.version>4.13.2</junit.version>
<spark.version>{spark_version}</spark.version> <spark.version>{spark_version}</spark.version>
<scala.version>{scala_version}</scala.version> <scala.version>{scala_version}</scala.version>
<scalatest.version>3.2.15</scalatest.version>
<scala.binary.version>{scala_binary_version}</scala.binary.version> <scala.binary.version>{scala_binary_version}</scala.binary.version>
<kryo.version>5.5.0</kryo.version>
</properties> </properties>
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>com.esotericsoftware</groupId> <groupId>com.esotericsoftware</groupId>
<artifactId>kryo</artifactId> <artifactId>kryo</artifactId>
<version>4.0.2</version> <version>${{kryo.version}}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.scala-lang</groupId> <groupId>org.scala-lang</groupId>
@ -48,29 +51,12 @@ pom_template = """
<artifactId>commons-logging</artifactId> <artifactId>commons-logging</artifactId>
<version>1.2</version> <version>1.2</version>
</dependency> </dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-testkit_${{scala.binary.version}}</artifactId>
<version>2.6.20</version>
<scope>test</scope>
</dependency>
<dependency> <dependency>
<groupId>org.scalatest</groupId> <groupId>org.scalatest</groupId>
<artifactId>scalatest_${{scala.binary.version}}</artifactId> <artifactId>scalatest_${{scala.binary.version}}</artifactId>
<version>3.0.8</version> <version>${{scalatest.version}}</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency>
<groupId>org.scalactic</groupId>
<artifactId>scalactic_${{scala.binary.version}}</artifactId>
<version>3.2.15</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.9</version>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_${{scala.binary.version}}</artifactId> <artifactId>spark-core_${{scala.binary.version}}</artifactId>
@ -92,7 +78,7 @@ pom_template = """
<dependency> <dependency>
<groupId>junit</groupId> <groupId>junit</groupId>
<artifactId>junit</artifactId> <artifactId>junit</artifactId>
<version>4.13.2</version> <version>${{junit.version}}</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>
@ -122,36 +108,9 @@ pom_template = """
<build> <build>
<plugins> <plugins>
<!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.8.2</version>
</plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId> <artifactId>maven-assembly-plugin</artifactId>
<version>2.4</version>
<configuration> <configuration>
<descriptorRefs> <descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef> <descriptorRef>jar-with-dependencies</descriptorRef>
@ -171,22 +130,12 @@ pom_template = """
</execution> </execution>
</executions> </executions>
</plugin> </plugin>
<!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
<plugin>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
</plugin>
<plugin>
<artifactId>maven-project-info-reports-plugin</artifactId>
<version>3.0.0</version>
</plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId> <artifactId>maven-surefire-plugin</artifactId>
<version>2.22.1</version>
<configuration> <configuration>
<dependenciesToScan> <dependenciesToScan>
<dependency>ml.dmlc:xgboost4j_2.12</dependency> <dependency>ml.dmlc:xgboost4j_${{scala.binary.version}}</dependency>
</dependenciesToScan> </dependenciesToScan>
</configuration> </configuration>
</plugin> </plugin>

View File

@ -1,20 +0,0 @@
package ml.dmlc.xgboost4j.tester;
import static org.junit.Assert.assertTrue;
import org.junit.Test;
/**
 * Placeholder JUnit test in the {@code ml.dmlc.xgboost4j.tester} package.
 * It exercises no production code.
 */
public class AppTest {

    /**
     * Always passes: asserts the literal {@code true}.
     */
    @Test
    public void shouldAnswerWithTrue() {
        assertTrue(true);
    }
}

View File

@ -5,10 +5,11 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
</parent> </parent>
<artifactId>xgboost4j_2.12</artifactId> <name>xgboost4j</name>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>2.0.0-SNAPSHOT</version> <version>2.0.0-SNAPSHOT</version>
<packaging>jar</packaging> <packaging>jar</packaging>
@ -28,13 +29,13 @@
<dependency> <dependency>
<groupId>junit</groupId> <groupId>junit</groupId>
<artifactId>junit</artifactId> <artifactId>junit</artifactId>
<version>4.13.2</version> <version>${junit.version}</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.scalatest</groupId> <groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.binary.version}</artifactId> <artifactId>scalatest_${scala.binary.version}</artifactId>
<version>3.2.16</version> <version>${scalatest.version}</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>

View File

@ -37,7 +37,7 @@ trait EvalTrait extends IEvaluation {
*/ */
def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float
private[scala] def eval(predicts: Array[Array[Float]], jdmat: java.DMatrix): Float = { def eval(predicts: Array[Array[Float]], jdmat: java.DMatrix): Float = {
require(predicts.length == jdmat.getLabel.length, "predicts size and label size must match " + require(predicts.length == jdmat.getLabel.length, "predicts size and label size must match " +
s" predicts size: ${predicts.length}, label size: ${jdmat.getLabel.length}") s" predicts size: ${predicts.length}, label size: ${jdmat.getLabel.length}")
eval(predicts, new DMatrix(jdmat)) eval(predicts, new DMatrix(jdmat))

View File

@ -31,7 +31,7 @@ trait ObjectiveTrait extends IObjective {
*/ */
def getGradient(predicts: Array[Array[Float]], dtrain: DMatrix): List[Array[Float]] def getGradient(predicts: Array[Array[Float]], dtrain: DMatrix): List[Array[Float]]
private[scala] def getGradient(predicts: Array[Array[Float]], dtrain: JDMatrix): def getGradient(predicts: Array[Array[Float]], dtrain: JDMatrix):
java.util.List[Array[Float]] = { java.util.List[Array[Float]] = {
getGradient(predicts, new DMatrix(dtrain)).asJava getGradient(predicts, new DMatrix(dtrain)).asJava
} }

View File

@ -17,12 +17,11 @@
package ml.dmlc.xgboost4j.scala package ml.dmlc.xgboost4j.scala
import java.io.InputStream import java.io.InputStream
import ml.dmlc.xgboost4j.java.{XGBoostError, XGBoost => JXGBoost}
import ml.dmlc.xgboost4j.java.{XGBoostError, Booster => JBooster, XGBoost => JXGBoost} import scala.jdk.CollectionConverters._
import scala.collection.JavaConverters._
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.Path
/** /**
* XGBoost Scala Training function. * XGBoost Scala Training function.
@ -40,7 +39,12 @@ object XGBoost {
earlyStoppingRound: Int = 0, earlyStoppingRound: Int = 0,
prevBooster: Booster, prevBooster: Booster,
checkpointParams: Option[ExternalCheckpointParams]): Booster = { checkpointParams: Option[ExternalCheckpointParams]): Booster = {
val jWatches = watches.mapValues(_.jDMatrix).asJava
// we have to filter null value for customized obj and eval
val jParams: java.util.Map[String, AnyRef] =
params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).toMap.asJava
val jWatches = watches.mapValues(_.jDMatrix).toMap.asJava
val jBooster = if (prevBooster == null) { val jBooster = if (prevBooster == null) {
null null
} else { } else {
@ -51,8 +55,7 @@ object XGBoost {
map(cp => { map(cp => {
JXGBoost.trainAndSaveCheckpoint( JXGBoost.trainAndSaveCheckpoint(
dtrain.jDMatrix, dtrain.jDMatrix,
// we have to filter null value for customized obj and eval jParams,
params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).asJava,
numRounds, jWatches, metrics, obj, eval, earlyStoppingRound, jBooster, numRounds, jWatches, metrics, obj, eval, earlyStoppingRound, jBooster,
cp.checkpointInterval, cp.checkpointInterval,
cp.checkpointPath, cp.checkpointPath,
@ -61,8 +64,7 @@ object XGBoost {
getOrElse( getOrElse(
JXGBoost.train( JXGBoost.train(
dtrain.jDMatrix, dtrain.jDMatrix,
// we have to filter null value for customized obj and eval jParams,
params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).asJava,
numRounds, jWatches, metrics, obj, eval, earlyStoppingRound, jBooster) numRounds, jWatches, metrics, obj, eval, earlyStoppingRound, jBooster)
) )
if (prevBooster == null) { if (prevBooster == null) {

View File

@ -4,11 +4,18 @@ set -euo pipefail
source tests/buildkite/conftest.sh source tests/buildkite/conftest.sh
echo "--- Build XGBoost JVM packages" echo "--- Build XGBoost JVM packages scala 2.12"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \ tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION} ${SPARK_VERSION}
echo "--- Build XGBoost JVM packages scala 2.13"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION} "" "" "true"
echo "--- Stash XGBoost4J JARs" echo "--- Stash XGBoost4J JARs"
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar" buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar" buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar" buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"

View File

@ -25,7 +25,7 @@ set -x
CUDA_VERSION=11.8.0 CUDA_VERSION=11.8.0
NCCL_VERSION=2.16.5-1 NCCL_VERSION=2.16.5-1
RAPIDS_VERSION=23.02 RAPIDS_VERSION=23.02
SPARK_VERSION=3.1.1 SPARK_VERSION=3.4.0
JDK_VERSION=8 JDK_VERSION=8
if [[ -z ${BUILDKITE:-} ]] if [[ -z ${BUILDKITE:-} ]]

View File

@ -20,10 +20,14 @@ RUN \
wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \ wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \ tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
ln -s /opt/apache-maven-3.6.1/ /opt/maven && \ ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
# Spark # Spark with scala 2.12
wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \ mkdir -p /opt/spark-scala-2.12 && \
tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \ wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz && \
ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark tar xvf spark-$SPARK_VERSION-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \
# Spark with scala 2.13
mkdir -p /opt/spark-scala-2.13 && \
wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz && \
tar xvf spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13
ENV PATH=/opt/mambaforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH ENV PATH=/opt/mambaforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH

View File

@ -6,6 +6,7 @@ set -x
spark_version=$1 spark_version=$1
use_cuda=$2 use_cuda=$2
gpu_arch=$3 gpu_arch=$3
use_scala213=$4
gpu_options="" gpu_options=""
if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then
@ -22,7 +23,13 @@ export RABIT_MOCK=ON
if [ "x$gpu_arch" != "x" ]; then if [ "x$gpu_arch" != "x" ]; then
export GPU_ARCH_FLAG=$gpu_arch export GPU_ARCH_FLAG=$gpu_arch
fi fi
mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options
mvn_profile_string=""
if [ "x$use_scala213" != "x" ]; then
export mvn_profile_string="-Pdefault,scala-2.13"
fi
mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options
set +x set +x
set +e set +e

View File

@ -6,37 +6,56 @@ set -x
# Initialize local Maven repository # Initialize local Maven repository
./tests/ci_build/initialize_maven.sh ./tests/ci_build/initialize_maven.sh
# Get version number of XGBoost4J and other auxiliary information
cd jvm-packages cd jvm-packages
jvm_packages_dir=`pwd`
# Get version number of XGBoost4J and other auxiliary information
xgboost4j_version=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout) xgboost4j_version=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
maven_compiler_source=$(mvn help:evaluate -Dexpression=maven.compiler.source -q -DforceStdout) maven_compiler_source=$(mvn help:evaluate -Dexpression=maven.compiler.source -q -DforceStdout)
maven_compiler_target=$(mvn help:evaluate -Dexpression=maven.compiler.target -q -DforceStdout) maven_compiler_target=$(mvn help:evaluate -Dexpression=maven.compiler.target -q -DforceStdout)
spark_version=$(mvn help:evaluate -Dexpression=spark.version -q -DforceStdout) spark_version=$(mvn help:evaluate -Dexpression=spark.version -q -DforceStdout)
scala_version=$(mvn help:evaluate -Dexpression=scala.version -q -DforceStdout)
scala_binary_version=$(mvn help:evaluate -Dexpression=scala.binary.version -q -DforceStdout)
# Install XGBoost4J JAR into local Maven repository if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar cd $jvm_packages_dir/xgboost4j-tester
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}-tests.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=test-jar -Dclassifier=tests
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-spark_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-example_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
cd xgboost4j-tester
# Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests
python3 ./generate_pom.py ${xgboost4j_version} ${maven_compiler_source} ${maven_compiler_target} ${spark_version} ${scala_version} ${scala_binary_version}
# Run unit tests with XGBoost4J
mvn --no-transfer-progress package
# Run integration tests with XGBoost4J
java -jar ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar
# Run integration tests with XGBoost4J-Spark
if [ ! -z "$RUN_INTEGRATION_TEST" ]
then
python3 get_iris.py python3 get_iris.py
spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv cd $jvm_packages_dir
spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv ${PWD}/native_model ${PWD}/pipeline_model
fi fi
# including maven profiles for different scala versions: 2.12 is the default at the moment.
for _maven_profile_string in "" "-Pdefault,scala-2.13"; do
scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout)
scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout)
# Install XGBoost4J JAR into local Maven repository
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}-tests.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=test-jar -Dclassifier=tests
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-spark_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-example_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
cd xgboost4j-tester
# Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests
python3 ./generate_pom.py ${xgboost4j_version} ${maven_compiler_source} ${maven_compiler_target} ${spark_version} ${scala_version} ${scala_binary_version}
# Build package and unit tests with XGBoost4J
mvn --no-transfer-progress clean package
xgboost4j_tester_jar="$jvm_packages_dir/xgboost4j-tester/target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar"
# Run integration tests with XGBoost4J
java -jar $xgboost4j_tester_jar
# Run integration tests with XGBoost4J-Spark
if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
# Changing directory so that we do not mix code and resulting files
cd target
if [[ "$scala_binary_version" == "2.12" ]]; then
/opt/spark-scala-2.12/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv
/opt/spark-scala-2.12/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv ${PWD}/native_model-${scala_version} ${PWD}/pipeline_model-${scala_version}
elif [[ "$scala_binary_version" == "2.13" ]]; then
/opt/spark-scala-2.13/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv
/opt/spark-scala-2.13/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv ${PWD}/native_model-${scala_version} ${PWD}/pipeline_model-${scala_version}
else
echo "Unexpected scala version: $scala_version ($scala_binary_version)."
fi
fi
cd $jvm_packages_dir
done
set +x set +x
set +e set +e