[jvm-packages] Add Rapids plugin support (#7491)
* Add GPU pre-processing pipeline.
This commit is contained in:
@@ -1 +0,0 @@
|
||||
../../xgboost4j-spark/src/test
|
||||
1
jvm-packages/xgboost4j-spark-gpu/src/test/resources
Symbolic link
1
jvm-packages/xgboost4j-spark-gpu/src/test/resources
Symbolic link
@@ -0,0 +1 @@
|
||||
../../../xgboost4j-spark/src/test/resources
|
||||
@@ -0,0 +1,293 @@
|
||||
/*
|
||||
Copyright (c) 2021 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.rapids.spark
|
||||
|
||||
import java.nio.file.{Files, Path}
|
||||
import java.sql.{Date, Timestamp}
|
||||
import java.util.{Locale, TimeZone}
|
||||
|
||||
import com.nvidia.spark.rapids.RapidsConf
|
||||
import org.scalatest.{BeforeAndAfterAll, FunSuite}
|
||||
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.network.util.JavaUtils
|
||||
import org.apache.spark.sql.{Row, SparkSession}
|
||||
import org.apache.spark.sql.rapids.execution.TrampolineUtil
|
||||
|
||||
trait GpuTestSuite extends FunSuite with TmpFolderSuite {
  import SparkSessionHolder.withSparkSession

  /** Resolves a classpath resource name (must start with "/") to its filesystem path. */
  protected def getResourcePath(resource: String): String = {
    require(resource.startsWith("/"), "resource must start with /")
    getClass.getResource(resource).getPath
  }

  /**
   * Builds a SparkConf that enables GPU CSV reading for all the primitive
   * column types used by these tests (they are disabled by default in the
   * RAPIDS plugin).
   */
  def enableCsvConf(): SparkConf = {
    new SparkConf()
      .set(RapidsConf.ENABLE_READ_CSV_DATES.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_BYTES.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_SHORTS.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_INTEGERS.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_LONGS.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_FLOATS.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_DOUBLES.key, "true")
  }

  /** Runs `f` against the shared session with RAPIDS SQL acceleration enabled. */
  def withGpuSparkSession[U](conf: SparkConf = new SparkConf())(f: SparkSession => U): U = {
    // set "spark.rapids.sql.explain" to "ALL" to check if the operators
    // can be replaced by GPU
    val c = conf.clone()
      .set("spark.rapids.sql.enabled", "true")
    withSparkSession(c, f)
  }

  /** Runs `f` against the shared session with RAPIDS SQL acceleration disabled. */
  def withCpuSparkSession[U](conf: SparkConf = new SparkConf())(f: SparkSession => U): U = {
    val c = conf.clone()
      .set("spark.rapids.sql.enabled", "false") // Just to be sure
    withSparkSession(c, f)
  }

  /**
   * Compares two collected result sets, optionally sorting both sides first so
   * the comparison is order-insensitive. Float/double values are compared with
   * the given epsilon tolerance (see `compare`).
   */
  def compareResults(
      sort: Boolean,
      floatEpsilon: Double,
      fromLeft: Array[Row],
      fromRight: Array[Row]): Boolean = {
    if (sort) {
      val left = fromLeft.map(_.toSeq).sortWith(seqLt)
      val right = fromRight.map(_.toSeq).sortWith(seqLt)
      compare(left, right, floatEpsilon)
    } else {
      compare(fromLeft, fromRight, floatEpsilon)
    }
  }

  // we guarantee that the types will be the same
  private def seqLt(a: Seq[Any], b: Seq[Any]): Boolean = {
    // Fix: the original returned true only when `a` was shorter, then looped
    // over a.indices and indexed b(i) — which throws when `a` is longer than
    // `b`. Order by length for ANY mismatch so the element loop only runs on
    // equal-length sequences.
    if (a.length != b.length) {
      return a.length < b.length
    }
    // lengths are the same
    for (i <- a.indices) {
      val v1 = a(i)
      val v2 = b(i)
      if (v1 != v2) {
        // null is always < anything but null
        if (v1 == null) {
          return true
        }
        if (v2 == null) {
          return false
        }
        (v1, v2) match {
          case (i1: Int, i2: Int) =>
            if (i1 < i2) {
              return true
            } else if (i1 > i2) {
              return false
            } // else equal go on
          case (i1: Long, i2: Long) =>
            if (i1 < i2) {
              return true
            } else if (i1 > i2) {
              return false
            } // else equal go on
          case (i1: Float, i2: Float) =>
            // NaN sorts after every real value
            if (i1.isNaN() && !i2.isNaN()) return false
            else if (!i1.isNaN() && i2.isNaN()) return true
            else if (i1 < i2) {
              return true
            } else if (i1 > i2) {
              return false
            } // else equal go on
          case (i1: Date, i2: Date) =>
            if (i1.before(i2)) {
              return true
            } else if (i1.after(i2)) {
              return false
            } // else equal go on
          case (i1: Double, i2: Double) =>
            // NaN sorts after every real value
            if (i1.isNaN() && !i2.isNaN()) return false
            else if (!i1.isNaN() && i2.isNaN()) return true
            else if (i1 < i2) {
              return true
            } else if (i1 > i2) {
              return false
            } // else equal go on
          case (i1: Short, i2: Short) =>
            if (i1 < i2) {
              return true
            } else if (i1 > i2) {
              return false
            } // else equal go on
          case (i1: Timestamp, i2: Timestamp) =>
            if (i1.before(i2)) {
              return true
            } else if (i1.after(i2)) {
              return false
            } // else equal go on
          case (s1: String, s2: String) =>
            val cmp = s1.compareTo(s2)
            if (cmp < 0) {
              return true
            } else if (cmp > 0) {
              return false
            } // else equal go on
          case (o1, _) =>
            throw new UnsupportedOperationException(o1.getClass + " is not supported yet")
        }
      }
    }
    // They are equal...
    false
  }

  /**
   * Deep structural equality with optional floating-point tolerance.
   *
   * With epsilon <= 0, floats/doubles must match bit-for-bit (this also
   * distinguishes 0.0 from -0.0). With epsilon > 0, they may differ by a
   * relative tolerance (absolute when expected == 0); a diagnostic is printed
   * to stderr when the tolerance check fails.
   */
  private def compare(expected: Any, actual: Any, epsilon: Double = 0.0): Boolean = {
    def doublesAreEqualWithinPercentage(expected: Double, actual: Double): (String, Boolean) = {
      if (!compare(expected, actual)) {
        if (expected != 0) {
          val v = Math.abs((expected - actual) / expected)
          (s"\n\nABS($expected - $actual) / ABS($actual) == $v is not <= $epsilon ", v <= epsilon)
        } else {
          val v = Math.abs(expected - actual)
          (s"\n\nABS($expected - $actual) == $v is not <= $epsilon ", v <= epsilon)
        }
      } else {
        ("SUCCESS", true)
      }
    }
    (expected, actual) match {
      case (a: Float, b: Float) if a.isNaN && b.isNaN => true
      case (a: Double, b: Double) if a.isNaN && b.isNaN => true
      case (null, null) => true
      case (null, _) => false
      case (_, null) => false
      case (a: Array[_], b: Array[_]) =>
        a.length == b.length && a.zip(b).forall { case (l, r) => compare(l, r, epsilon) }
      case (a: Map[_, _], b: Map[_, _]) =>
        a.size == b.size && a.keys.forall { aKey =>
          b.keys.find(bKey => compare(aKey, bKey))
            .exists(bKey => compare(a(aKey), b(bKey), epsilon))
        }
      case (a: Iterable[_], b: Iterable[_]) =>
        a.size == b.size && a.zip(b).forall { case (l, r) => compare(l, r, epsilon) }
      case (a: Product, b: Product) =>
        compare(a.productIterator.toSeq, b.productIterator.toSeq, epsilon)
      case (a: Row, b: Row) =>
        compare(a.toSeq, b.toSeq, epsilon)
      // 0.0 == -0.0, turn float/double to bits before comparison, to distinguish 0.0 and -0.0.
      case (a: Double, b: Double) if epsilon <= 0 =>
        java.lang.Double.doubleToRawLongBits(a) == java.lang.Double.doubleToRawLongBits(b)
      case (a: Double, b: Double) if epsilon > 0 =>
        val ret = doublesAreEqualWithinPercentage(a, b)
        if (!ret._2) {
          System.err.println(ret._1 + " (double)")
        }
        ret._2
      case (a: Float, b: Float) if epsilon <= 0 =>
        java.lang.Float.floatToRawIntBits(a) == java.lang.Float.floatToRawIntBits(b)
      case (a: Float, b: Float) if epsilon > 0 =>
        val ret = doublesAreEqualWithinPercentage(a, b)
        if (!ret._2) {
          System.err.println(ret._1 + " (float)")
        }
        ret._2
      case (a, b) => a == b
    }
  }

}
|
||||
|
||||
trait TmpFolderSuite extends BeforeAndAfterAll { self: FunSuite =>
  // Root temporary directory for the whole suite; created once in beforeAll
  // and deleted recursively in afterAll.
  protected var tempDir: Path = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    tempDir = Files.createTempDirectory(getClass.getName)
  }

  override def afterAll(): Unit = {
    JavaUtils.deleteRecursively(tempDir.toFile)
    super.afterAll()
  }

  /** Creates a fresh sub-directory with the given prefix under the suite temp dir. */
  protected def createTmpFolder(prefix: String): Path = {
    Files.createTempDirectory(tempDir, prefix)
  }
}
|
||||
|
||||
object SparkSessionHolder extends Logging {

  // Shared session plus a snapshot of its pristine configuration, used to
  // restore clean state between tests.
  private var spark = createSparkSession()
  private var origConf = spark.conf.getAll
  private var origConfKeys = origConf.keys.toSet

  /** Applies each (key, value) pair, skipping keys that already hold the value. */
  private def setAllConfs(confs: Array[(String, String)]): Unit = confs.foreach {
    case (key, value) if spark.conf.get(key, null) != value =>
      spark.conf.set(key, value)
    case _ => // No need to modify it
  }

  /**
   * Tears down any existing session and builds a fresh local one with the
   * RAPIDS plugin loaded but SQL acceleration initially disabled.
   */
  private def createSparkSession(): SparkSession = {
    TrampolineUtil.cleanupAnyExistingSession()

    // Timezone is fixed to UTC to allow timestamps to work by default
    TimeZone.setDefault(TimeZone.getTimeZone("UTC"))
    // Add Locale setting
    Locale.setDefault(Locale.US)

    val builder = SparkSession.builder()
      .master("local[1]")
      .config("spark.sql.adaptive.enabled", "false")
      .config("spark.rapids.sql.enabled", "false")
      .config("spark.rapids.sql.test.enabled", "false")
      .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
      .config("spark.rapids.memory.gpu.pooling.enabled", "false") // Disable RMM for unit tests.
      .appName("XGBoost4j-Spark-Gpu unit test")

    builder.getOrCreate()
  }

  /** Rebuilds the session and refreshes the pristine-conf snapshot. */
  private def reinitSession(): Unit = {
    spark = createSparkSession()
    origConf = spark.conf.getAll
    origConfKeys = origConf.keys.toSet
  }

  /** Returns the shared session, re-creating it if none is currently active. */
  def sparkSession: SparkSession = {
    if (SparkSession.getActiveSession.isEmpty) {
      reinitSession()
    }
    spark
  }

  /**
   * Restores the session configuration to its pristine snapshot, unsetting any
   * keys added since the session was created.
   */
  def resetSparkSessionConf(): Unit = {
    if (SparkSession.getActiveSession.isEmpty) {
      reinitSession()
    } else {
      setAllConfs(origConf.toArray)
      val currentKeys = spark.conf.getAll.keys.toSet
      val toRemove = currentKeys -- origConfKeys
      toRemove.foreach(spark.conf.unset)
    }
    logDebug(s"RESET CONF TO: ${spark.conf.getAll}")
  }

  /** Resets the shared session conf, applies `conf` on top, then runs `f`. */
  def withSparkSession[U](conf: SparkConf, f: SparkSession => U): U = {
    // Fix: call the nullary side-effecting method with parentheses — bare
    // `resetSparkSessionConf` relies on auto-application, which is deprecated
    // in Scala 2.13 and an error in Scala 3.
    resetSparkSessionConf()
    logDebug(s"SETTING CONF: ${conf.getAll.toMap}")
    setAllConfs(conf.getAll)
    logDebug(s"RUN WITH CONF: ${spark.conf.getAll}\n")
    f(spark)
  }

}
|
||||
@@ -0,0 +1,226 @@
|
||||
/*
|
||||
Copyright (c) 2021 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.rapids.spark
|
||||
|
||||
import java.io.File
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
|
||||
|
||||
import org.apache.spark.ml.feature.VectorAssembler
|
||||
import org.apache.spark.sql.functions.{col, udf}
|
||||
import org.apache.spark.sql.types.{FloatType, StructField, StructType}
|
||||
|
||||
class GpuXGBoostClassifierSuite extends GpuTestSuite {
  // Prefer the path relative to the repository root; fall back to the
  // jvm-packages working directory.
  private val dataPath = if (new File("../../demo/data/veterans_lung_cancer.csv").isFile) {
    "../../demo/data/veterans_lung_cancer.csv"
  } else {
    "../demo/data/veterans_lung_cancer.csv"
  }

  val labelName = "label_col"
  val schema = StructType(Seq(
    StructField("f1", FloatType), StructField("f2", FloatType), StructField("f3", FloatType),
    StructField("f4", FloatType), StructField("f5", FloatType), StructField("f6", FloatType),
    StructField("f7", FloatType), StructField("f8", FloatType), StructField("f9", FloatType),
    StructField("f10", FloatType), StructField("f11", FloatType), StructField("f12", FloatType),
    StructField(labelName, FloatType)
  ))
  // Every column except the label is a feature.
  val featureNames = schema.fieldNames.filter(s => !s.equals(labelName)).toSeq

  test("The transform result should be same for several runs on same model") {
    withGpuSparkSession(enableCsvConf()) { spark =>
      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
        "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
        "features_cols" -> featureNames, "label_col" -> labelName)
      val Array(trainDf, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
      // Train once, transform twice: both runs must agree.
      val model = new XGBoostClassifier(params)
        .fit(trainDf)
      val firstRun = model.transform(testDf).collect()
      val secondRun = model.transform(testDf).collect()
      assert(compareResults(true, 0.000001, firstRun, secondRun))
    }
  }

  test("use weight") {
    withGpuSparkSession(enableCsvConf()) { spark =>
      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
        "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
        "features_cols" -> featureNames, "label_col" -> labelName)
      val Array(trainDf, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
      // Down-weight rows with odd integral f1 so weighting visibly changes the model.
      val getWeightFromF1 = udf({ f1: Float => if (f1.toInt % 2 == 0) 1.0f else 0.001f })
      val weightedDf = trainDf.withColumn("weight", getWeightFromF1(col("f1")))

      val unweightedModel = new XGBoostClassifier(params)
        .fit(trainDf)
      val weightedModel = new XGBoostClassifier(params)
        .setWeightCol("weight")
        .fit(weightedDf)

      val unweightedPreds = unweightedModel.transform(testDf).collect()
      val weightedPreds = weightedModel.transform(testDf).collect()
      // Weighting must change the predictions.
      assert(!compareResults(true, 0.000001, unweightedPreds, weightedPreds))
    }
  }

  test("Save model and transform GPU dataset") {
    // Train a model on GPU
    val (gpuModel, testDf) = withGpuSparkSession(enableCsvConf()) { spark =>
      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
        "num_round" -> 10, "num_workers" -> 1)
      val Array(rawInput, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)

      val classifier = new XGBoostClassifier(params)
        .setFeaturesCols(featureNames)
        .setLabelCol(labelName)
        .setTreeMethod("gpu_hist")
      (classifier.fit(rawInput), testDf)
    }

    // Round-trip the model through disk.
    val modelPath = new File(tempDir.toFile, "xgbrModel").getPath
    gpuModel.write.overwrite().save(modelPath)
    val reloadedModel = XGBoostClassificationModel.load(modelPath)

    // transform on GPU: the in-memory and reloaded models must agree.
    withGpuSparkSession() { spark =>
      val inMemoryPreds = gpuModel
        .transform(testDf)
        .select(labelName, "rawPrediction", "probability", "prediction")
        .collect()

      val reloadedPreds = reloadedModel
        .transform(testDf)
        .select(labelName, "rawPrediction", "probability", "prediction")
        .collect()

      assert(compareResults(true, 0.000001, inMemoryPreds, reloadedPreds))
    }
  }

  test("Model trained on CPU can transform GPU dataset") {
    // Train a model on CPU
    val cpuModel = withCpuSparkSession() { spark =>
      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
        "num_round" -> 10, "num_workers" -> 1)
      val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)

      val assembler = new VectorAssembler()
        .setHandleInvalid("keep")
        .setInputCols(featureNames.toArray)
        .setOutputCol("features")
      val trainingDf = assembler.transform(rawInput).select("features", labelName)

      new XGBoostClassifier(params)
        .setFeaturesCol("features")
        .setLabelCol(labelName)
        .setTreeMethod("auto")
        .fit(trainingDf)
    }

    val modelPath = new File(tempDir.toFile, "xgbrModel").getPath
    cpuModel.write.overwrite().save(modelPath)
    val reloadedModel = XGBoostClassificationModel.load(modelPath)

    // transform on GPU
    withGpuSparkSession() { spark =>
      val Array(_, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)

      // Since CPU model does not know the information about the features cols that GPU transform
      // pipeline requires. End user needs to setFeaturesCols in the model manually
      val thrown = intercept[IllegalArgumentException](cpuModel
        .transform(testDf)
        .collect())
      assert(thrown.getMessage.contains("Gpu transform requires features columns. " +
        "please refer to setFeaturesCols"))

      val inMemoryPreds = cpuModel
        .setFeaturesCols(featureNames)
        .transform(testDf)
        .collect()

      val reloadedPreds = reloadedModel
        .setFeaturesCols(featureNames)
        .transform(testDf)
        .collect()

      assert(compareResults(true, 0.000001, inMemoryPreds, reloadedPreds))
    }
  }

  test("Model trained on GPU can transform CPU dataset") {
    // Train a model on GPU
    val gpuModel = withGpuSparkSession(enableCsvConf()) { spark =>
      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
        "num_round" -> 10, "num_workers" -> 1)
      val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)

      new XGBoostClassifier(params)
        .setFeaturesCols(featureNames)
        .setLabelCol(labelName)
        .setTreeMethod("gpu_hist")
        .fit(rawInput)
    }

    val modelPath = new File(tempDir.toFile, "xgbrModel").getPath
    gpuModel.write.overwrite().save(modelPath)
    val reloadedModel = XGBoostClassificationModel.load(modelPath)

    // transform on CPU
    withCpuSparkSession() { spark =>
      val Array(_, rawInput) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)

      val featureColName = "feature_col"
      val assembler = new VectorAssembler()
        .setHandleInvalid("keep")
        .setInputCols(featureNames.toArray)
        .setOutputCol(featureColName)
      val testDf = assembler.transform(rawInput).select(featureColName, labelName)

      // Since GPU model does not know the information about the features col name that CPU
      // transform pipeline requires. End user needs to setFeaturesCol in the model manually
      val thrown = intercept[IllegalArgumentException](
        gpuModel
          .transform(testDf)
          .collect())
      assert(thrown.getMessage.contains("features does not exist"))

      val inMemoryPreds = gpuModel
        .setFeaturesCol(featureColName)
        .transform(testDf)
        .select(labelName, "rawPrediction", "probability", "prediction")
        .collect()

      val reloadedPreds = reloadedModel
        .setFeaturesCol(featureColName)
        .transform(testDf)
        .select(labelName, "rawPrediction", "probability", "prediction")
        .collect()

      assert(compareResults(true, 0.000001, inMemoryPreds, reloadedPreds))
    }
  }

}
|
||||
@@ -0,0 +1,182 @@
|
||||
/*
|
||||
Copyright (c) 2021 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.rapids.spark
|
||||
|
||||
import java.io.File
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier}
|
||||
|
||||
import org.apache.spark.sql.functions.col
|
||||
import org.apache.spark.sql.types.StringType
|
||||
|
||||
class GpuXGBoostGeneralSuite extends GpuTestSuite {

  private val labelName = "label_col"
  private val weightName = "weight_col"
  private val baseMarginName = "margin_col"
  private val featureNames = Seq("f1", "f2", "f3")
  private val allColumnNames = featureNames :+ weightName :+ baseMarginName :+ labelName
  // Small in-memory 3-class training set.
  private val trainingData = Seq(
    // f1, f2, f3, weight, margin, label
    (1.0f, 2.0f, 3.0f, 1.0f, 0.5f, 0),
    (2.0f, 3.0f, 4.0f, 2.0f, 0.6f, 0),
    (1.2f, 2.1f, 3.1f, 1.1f, 0.51f, 0),
    (2.3f, 3.1f, 4.1f, 2.1f, 0.61f, 0),
    (3.0f, 4.0f, 5.0f, 1.5f, 0.3f, 1),
    (4.0f, 5.0f, 6.0f, 2.5f, 0.4f, 1),
    (3.1f, 4.1f, 5.1f, 1.6f, 0.4f, 1),
    (4.1f, 5.1f, 6.1f, 2.6f, 0.5f, 1),
    (5.0f, 6.0f, 7.0f, 1.0f, 0.2f, 2),
    (6.0f, 7.0f, 8.0f, 1.3f, 0.6f, 2),
    (5.1f, 6.1f, 7.1f, 1.2f, 0.1f, 2),
    (6.1f, 7.1f, 8.1f, 1.4f, 0.7f, 2),
    (6.2f, 7.2f, 8.2f, 1.5f, 0.8f, 2))

  test("MLlib way setting features_cols should work") {
    withGpuSparkSession() { spark =>
      import spark.implicits._
      val trainingDf = trainingData.toDF(allColumnNames: _*)
      // features_cols / label_col passed through the param map.
      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist",
        "features_cols" -> featureNames, "label_col" -> labelName)
      new XGBoostClassifier(params)
        .fit(trainingDf)
    }
  }

  test("disorder feature columns should work") {
    withGpuSparkSession() { spark =>
      import spark.implicits._
      // Shuffle the column order so it no longer matches featureNames.
      val trainingDf = trainingData.toDF(allColumnNames: _*)
        .select(labelName, "f2", weightName, "f3", baseMarginName, "f1")

      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
      new XGBoostClassifier(params)
        .setFeaturesCols(featureNames)
        .setLabelCol(labelName)
        .fit(trainingDf)
    }
  }

  test("Throw exception when feature/label columns are not numeric type") {
    withGpuSparkSession() { spark =>
      import spark.implicits._
      val originalDf = trainingData.toDF(allColumnNames: _*)
      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")

      // A string-typed feature column must be rejected.
      val stringFeatureDf = originalDf.withColumn("f2", col("f2").cast(StringType))
      val thrown1 = intercept[IllegalArgumentException] {
        new XGBoostClassifier(params)
          .setFeaturesCols(featureNames)
          .setLabelCol(labelName)
          .fit(stringFeatureDf)
      }
      assert(thrown1.getMessage.contains("Column f2 must be of NumericType but found: string."))

      // A string-typed label column must be rejected as well.
      val stringLabelDf = originalDf.withColumn(labelName, col(labelName).cast(StringType))
      val thrown2 = intercept[IllegalArgumentException] {
        new XGBoostClassifier(params)
          .setFeaturesCols(featureNames)
          .setLabelCol(labelName)
          .fit(stringLabelDf)
      }
      assert(thrown2.getMessage.contains(
        s"Column $labelName must be of NumericType but found: string."))
    }
  }

  test("Throw exception when features_cols or label_col is not set") {
    withGpuSparkSession() { spark =>
      import spark.implicits._
      val trainingDf = trainingData.toDF(allColumnNames: _*)
      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")

      // Missing features columns.
      val thrown = intercept[IllegalArgumentException] {
        new XGBoostClassifier(params)
          .setLabelCol(labelName)
          .fit(trainingDf)
      }
      assert(thrown.getMessage.contains("Gpu train requires features columns."))

      // Missing label column (default "label" is absent from the DataFrame).
      val thrown1 = intercept[IllegalArgumentException] {
        new XGBoostClassifier(params)
          .setFeaturesCols(featureNames)
          .fit(trainingDf)
      }
      assert(thrown1.getMessage.contains("label does not exist."))
    }
  }

  test("Throw exception when tree method is not set to gpu_hist") {
    withGpuSparkSession() { spark =>
      import spark.implicits._
      val trainingDf = trainingData.toDF(allColumnNames: _*)
      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "hist")
      val thrown = intercept[IllegalArgumentException] {
        new XGBoostClassifier(params)
          .setFeaturesCols(featureNames)
          .setLabelCol(labelName)
          .fit(trainingDf)
      }
      assert(thrown.getMessage.contains("GPU train requires tree_method set to gpu_hist"))
    }
  }

  test("Train with eval") {

    withGpuSparkSession() { spark =>
      import spark.implicits._
      val Array(trainingDf, eval1, eval2) = trainingData.toDF(allColumnNames: _*)
        .randomSplit(Array(0.6, 0.2, 0.2), seed = 1)
      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
      val model = new XGBoostClassifier(params)
        .setFeaturesCols(featureNames)
        .setLabelCol(labelName)
        .setEvalSets(Map("eval1" -> eval1, "eval2" -> eval2))
        .fit(trainingDf)

      // One objective history per eval set, one entry per round, and the eval
      // histories must differ from the training history.
      assert(model.summary.validationObjectiveHistory.length === 2)
      assert(model.summary.validationObjectiveHistory.map(_._1).toSet === Set("eval1", "eval2"))
      assert(model.summary.validationObjectiveHistory(0)._2.length === 5)
      assert(model.summary.validationObjectiveHistory(1)._2.length === 5)
      assert(model.summary.trainObjectiveHistory !== model.summary.validationObjectiveHistory(0))
      assert(model.summary.trainObjectiveHistory !== model.summary.validationObjectiveHistory(1))
    }
  }

  test("test persistence of XGBoostClassifier and XGBoostClassificationModel") {
    val xgbcPath = new File(tempDir.toFile, "xgbc").getPath
    withGpuSparkSession() { spark =>
      import spark.implicits._
      val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist",
        "features_cols" -> featureNames, "label_col" -> labelName)
      // Save the estimator and verify every param survives the round trip.
      new XGBoostClassifier(params).write.overwrite().save(xgbcPath)
      val reloadedParams = XGBoostClassifier.load(xgbcPath).MLlib2XGBoostParams
      params.foreach {
        case (k, v) => assert(v.toString == reloadedParams(k).toString)
      }
    }
  }

}
|
||||
@@ -0,0 +1,239 @@
|
||||
/*
|
||||
Copyright (c) 2021 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.rapids.spark
|
||||
|
||||
import java.io.File
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor}
|
||||
|
||||
import org.apache.spark.ml.feature.VectorAssembler
|
||||
import org.apache.spark.sql.functions.{col, udf}
|
||||
import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType}
|
||||
|
||||
class GpuXGBoostRegressorSuite extends GpuTestSuite {
|
||||
|
||||
val labelName = "label_col"
|
||||
val groupName = "group_col"
|
||||
val schema = StructType(Seq(
|
||||
StructField(labelName, FloatType),
|
||||
StructField("f1", FloatType),
|
||||
StructField("f2", FloatType),
|
||||
StructField("f3", FloatType),
|
||||
StructField(groupName, IntegerType)))
|
||||
val featureNames = schema.fieldNames.filter(s =>
|
||||
!(s.equals(labelName) || s.equals(groupName))).toSeq
|
||||
|
||||
test("The transform result should be same for several runs on same model") {
|
||||
withGpuSparkSession(enableCsvConf()) { spark =>
|
||||
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror",
|
||||
"num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
|
||||
"features_cols" -> featureNames, "label_col" -> labelName)
|
||||
val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema)
|
||||
.csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)
|
||||
// Get a model
|
||||
val model = new XGBoostRegressor(xgbParam)
|
||||
.fit(originalDf)
|
||||
val left = model.transform(testDf).collect()
|
||||
val right = model.transform(testDf).collect()
|
||||
// The left should be same with right
|
||||
assert(compareResults(true, 0.000001, left, right))
|
||||
}
|
||||
}
|
||||
|
||||
test("use weight") {
|
||||
withGpuSparkSession(enableCsvConf()) { spark =>
|
||||
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror",
|
||||
"num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
|
||||
"features_cols" -> featureNames, "label_col" -> labelName)
|
||||
val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema)
|
||||
.csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)
|
||||
val getWeightFromF1 = udf({ f1: Float => if (f1.toInt % 2 == 0) 1.0f else 0.001f })
|
||||
val dfWithWeight = originalDf.withColumn("weight", getWeightFromF1(col("f1")))
|
||||
|
||||
val model = new XGBoostRegressor(xgbParam)
|
||||
.fit(originalDf)
|
||||
val model2 = new XGBoostRegressor(xgbParam)
|
||||
.setWeightCol("weight")
|
||||
.fit(dfWithWeight)
|
||||
|
||||
val left = model.transform(testDf).collect()
|
||||
val right = model2.transform(testDf).collect()
|
||||
// left should be different with right
|
||||
assert(!compareResults(true, 0.000001, left, right))
|
||||
}
|
||||
}
|
||||
|
||||
test("Save model and transform GPU dataset") {
  // Train a model on the GPU.
  val (gpuModel, testDf) = withGpuSparkSession(enableCsvConf()) { spark =>
    val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
      "num_round" -> 10, "num_workers" -> 1)
    val Array(rawInput, heldOut) = spark.read.option("header", "true").schema(schema)
      .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)

    val regressor = new XGBoostRegressor(params)
      .setFeaturesCols(featureNames)
      .setLabelCol(labelName)
      .setTreeMethod("gpu_hist")
    (regressor.fit(rawInput), heldOut)
  }

  // Round-trip the model through disk.
  val savePath = new File(tempDir.toFile, "xgbrModel").getPath
  gpuModel.write.overwrite().save(savePath)
  val reloadedModel = XGBoostRegressionModel.load(savePath)

  // The in-memory and reloaded models must agree when transforming on GPU.
  withGpuSparkSession() { spark =>
    val fromMemory = gpuModel
      .transform(testDf)
      .select(labelName, "prediction")
      .collect()

    val fromDisk = reloadedModel
      .transform(testDf)
      .select(labelName, "prediction")
      .collect()

    assert(compareResults(true, 0.000001, fromMemory, fromDisk))
  }
}
|
||||
|
||||
test("Model trained on CPU can transform GPU dataset") {
  // Build a model through the regular CPU pipeline: assemble the feature
  // columns into a single vector column named "features".
  val cpuModel = withCpuSparkSession() { spark =>
    val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror",
      "num_round" -> 10, "num_workers" -> 1)
    val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
      .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)

    val assembler = new VectorAssembler()
      .setHandleInvalid("keep")
      .setInputCols(featureNames.toArray)
      .setOutputCol("features")
    val trainDf = assembler.transform(rawInput).select("features", labelName)

    new XGBoostRegressor(params)
      .setFeaturesCol("features")
      .setLabelCol(labelName)
      .setTreeMethod("auto")
      .fit(trainDf)
  }

  // Round-trip the model through disk.
  val savePath = new File(tempDir.toFile, "xgbrModel").getPath
  cpuModel.write.overwrite().save(savePath)
  val reloadedModel = XGBoostRegressionModel.load(savePath)

  // Run the transform on GPU.
  withGpuSparkSession() { spark =>
    val Array(_, evalDf) = spark.read.option("header", "true").schema(schema)
      .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)

    // A CPU-trained model carries no feature-column list, which the GPU
    // transform pipeline requires; until setFeaturesCols is called it fails.
    val thrown = intercept[IllegalArgumentException](
      cpuModel.transform(evalDf).collect())
    assert(thrown.getMessage.contains("Gpu transform requires features columns. " +
      "please refer to setFeaturesCols"))

    val fromMemory = cpuModel
      .setFeaturesCols(featureNames)
      .transform(evalDf)
      .collect()

    val fromDisk = reloadedModel
      .setFeaturesCols(featureNames)
      .transform(evalDf)
      .collect()

    assert(compareResults(true, 0.000001, fromMemory, fromDisk))
  }
}
|
||||
|
||||
test("Model trained on GPU can transform CPU dataset") {
  // Train a model on the GPU, reading features straight from columns.
  val gpuModel = withGpuSparkSession(enableCsvConf()) { spark =>
    val params = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror",
      "num_round" -> 10, "num_workers" -> 1)
    val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
      .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)

    new XGBoostRegressor(params)
      .setFeaturesCols(featureNames)
      .setLabelCol(labelName)
      .setTreeMethod("gpu_hist")
      .fit(rawInput)
  }

  // Round-trip the model through disk.
  val savePath = new File(tempDir.toFile, "xgbrModel").getPath
  gpuModel.write.overwrite().save(savePath)
  val reloadedModel = XGBoostRegressionModel.load(savePath)

  // Run the transform on CPU, which expects an assembled vector column.
  withCpuSparkSession() { spark =>
    val Array(_, rawInput) = spark.read.option("header", "true").schema(schema)
      .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)

    val featureColName = "feature_col"
    val assembler = new VectorAssembler()
      .setHandleInvalid("keep")
      .setInputCols(featureNames.toArray)
      .setOutputCol(featureColName)
    val evalDf = assembler.transform(rawInput).select(featureColName, labelName)

    // A GPU-trained model carries no vector-column name, which the CPU
    // transform pipeline requires; until setFeaturesCol is called it fails.
    val thrown = intercept[IllegalArgumentException](
      gpuModel
        .transform(evalDf)
        .collect())
    assert(thrown.getMessage.contains("features does not exist"))

    val fromMemory = gpuModel
      .setFeaturesCol(featureColName)
      .transform(evalDf)
      .select(labelName, "prediction")
      .collect()

    val fromDisk = reloadedModel
      .setFeaturesCol(featureColName)
      .transform(evalDf)
      .select(labelName, "prediction")
      .collect()

    assert(compareResults(true, 0.000001, fromMemory, fromDisk))
  }
}
|
||||
|
||||
test("Ranking: train with Group") {
  withGpuSparkSession(enableCsvConf()) { spark =>
    val params = Map(
      "eta" -> 0.1f,
      "max_depth" -> 2,
      "objective" -> "rank:pairwise",
      "num_round" -> 10,
      "num_workers" -> 1,
      "tree_method" -> "gpu_hist",
      "features_cols" -> featureNames,
      "label_col" -> labelName)
    val Array(trainDf, evalDf) = spark.read.option("header", "true").schema(schema)
      .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)

    // Pairwise ranking needs the query-group column to be set on the estimator.
    val model = new XGBoostRegressor(params)
      .setGroupCol(groupName)
      .fit(trainDf)

    // Transform must produce exactly one prediction row per input row.
    val predictions = model.transform(evalDf).collect()
    assert(evalDf.count() === predictions.length)
  }
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user