[jvm-packages] Add Rapids plugin support (#7491)

* Add GPU pre-processing pipeline.
This commit is contained in:
Bobby Wang
2021-12-17 13:11:12 +08:00
committed by GitHub
parent 5b1161bb64
commit 24e25802a7
24 changed files with 2035 additions and 37 deletions

View File

@@ -1 +0,0 @@
../../xgboost4j-spark/src/test

View File

@@ -0,0 +1 @@
../../../xgboost4j-spark/src/test/resources

View File

@@ -0,0 +1,293 @@
/*
Copyright (c) 2021 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ml.dmlc.xgboost4j.scala.rapids.spark
import java.nio.file.{Files, Path}
import java.sql.{Date, Timestamp}
import java.util.{Locale, TimeZone}
import com.nvidia.spark.rapids.RapidsConf
import org.scalatest.{BeforeAndAfterAll, FunSuite}
import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.network.util.JavaUtils
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.rapids.execution.TrampolineUtil
trait GpuTestSuite extends FunSuite with TmpFolderSuite {
  import SparkSessionHolder.withSparkSession

  /** Resolves a classpath resource (must start with "/") to a filesystem path. */
  protected def getResourcePath(resource: String): String = {
    require(resource.startsWith("/"), "resource must start with /")
    getClass.getResource(resource).getPath
  }

  /** Builds a SparkConf allowing the Rapids plugin to read typed columns from CSV files. */
  def enableCsvConf(): SparkConf = {
    new SparkConf()
      .set(RapidsConf.ENABLE_READ_CSV_DATES.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_BYTES.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_SHORTS.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_INTEGERS.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_LONGS.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_FLOATS.key, "true")
      .set(RapidsConf.ENABLE_READ_CSV_DOUBLES.key, "true")
  }

  /**
   * Runs `f` against the shared SparkSession with the Rapids SQL plugin enabled.
   *
   * Tip: set "spark.rapids.sql.explain" to "ALL" in `conf` to check if the operators
   * can be replaced by GPU versions.
   */
  def withGpuSparkSession[U](conf: SparkConf = new SparkConf())(f: SparkSession => U): U = {
    val c = conf.clone()
      .set("spark.rapids.sql.enabled", "true")
    withSparkSession(c, f)
  }

  /** Runs `f` against the shared SparkSession with the Rapids SQL plugin disabled (CPU only). */
  def withCpuSparkSession[U](conf: SparkConf = new SparkConf())(f: SparkSession => U): U = {
    val c = conf.clone()
      .set("spark.rapids.sql.enabled", "false") // Just to be sure
    withSparkSession(c, f)
  }

  /**
   * Compares two collected result sets for equality.
   *
   * @param sort whether to sort both sides first (use when row order is non-deterministic)
   * @param floatEpsilon relative tolerance applied to float/double comparisons
   * @param fromLeft left-hand results
   * @param fromRight right-hand results
   * @return true when both sides are equal within the tolerance
   */
  def compareResults(
      sort: Boolean,
      floatEpsilon: Double,
      fromLeft: Array[Row],
      fromRight: Array[Row]): Boolean = {
    if (sort) {
      val left = fromLeft.map(_.toSeq).sortWith(seqLt)
      val right = fromRight.map(_.toSeq).sortWith(seqLt)
      compare(left, right, floatEpsilon)
    } else {
      compare(fromLeft, fromRight, floatEpsilon)
    }
  }

  // Less-than ordering over rows; we guarantee that the element types will be the same.
  private def seqLt(a: Seq[Any], b: Seq[Any]): Boolean = {
    if (a.length != b.length) {
      // Rows of different width are ordered by length. (The previous code only
      // handled a shorter left side; a longer left side would have indexed past
      // the end of `b` in the loop below.)
      return a.length < b.length
    }
    // lengths are the same
    for (i <- a.indices) {
      val v1 = a(i)
      val v2 = b(i)
      if (v1 != v2) {
        // null is always < anything but null
        if (v1 == null) {
          return true
        }
        if (v2 == null) {
          return false
        }
        (v1, v2) match {
          case (i1: Int, i2: Int) => if (i1 < i2) {
            return true
          } else if (i1 > i2) {
            return false
          } // else equal go on
          case (i1: Long, i2: Long) => if (i1 < i2) {
            return true
          } else if (i1 > i2) {
            return false
          } // else equal go on
          // NaN is ordered after every non-NaN float
          case (i1: Float, i2: Float) => if (i1.isNaN() && !i2.isNaN()) return false
          else if (!i1.isNaN() && i2.isNaN()) return true
          else if (i1 < i2) {
            return true
          } else if (i1 > i2) {
            return false
          } // else equal go on
          case (i1: Date, i2: Date) => if (i1.before(i2)) {
            return true
          } else if (i1.after(i2)) {
            return false
          } // else equal go on
          // NaN is ordered after every non-NaN double
          case (i1: Double, i2: Double) => if (i1.isNaN() && !i2.isNaN()) return false
          else if (!i1.isNaN() && i2.isNaN()) return true
          else if (i1 < i2) {
            return true
          } else if (i1 > i2) {
            return false
          } // else equal go on
          case (i1: Short, i2: Short) => if (i1 < i2) {
            return true
          } else if (i1 > i2) {
            return false
          } // else equal go on
          case (i1: Timestamp, i2: Timestamp) => if (i1.before(i2)) {
            return true
          } else if (i1.after(i2)) {
            return false
          } // else equal go on
          case (s1: String, s2: String) =>
            val cmp = s1.compareTo(s2)
            if (cmp < 0) {
              return true
            } else if (cmp > 0) {
              return false
            } // else equal go on
          case (o1, _) =>
            throw new UnsupportedOperationException(o1.getClass + " is not supported yet")
        }
      }
    }
    // They are equal...
    false
  }

  /**
   * Deep, recursive equality over rows/collections/primitives.
   *
   * With epsilon <= 0 floats/doubles are compared by raw bit pattern (so 0.0 != -0.0
   * and NaN == NaN); with epsilon > 0 they are compared with a relative tolerance.
   */
  private def compare(expected: Any, actual: Any, epsilon: Double = 0.0): Boolean = {
    // Returns (diagnostic message, within tolerance). The relative error is
    // computed against `expected`; absolute error is used when expected == 0.
    def doublesAreEqualWithinPercentage(expected: Double, actual: Double): (String, Boolean) = {
      if (!compare(expected, actual)) {
        if (expected != 0) {
          val v = Math.abs((expected - actual) / expected)
          // Fixed message: the denominator actually used is ABS(expected), not ABS(actual).
          (s"\n\nABS($expected - $actual) / ABS($expected) == $v is not <= $epsilon ", v <= epsilon)
        } else {
          val v = Math.abs(expected - actual)
          (s"\n\nABS($expected - $actual) == $v is not <= $epsilon ", v <= epsilon)
        }
      } else {
        ("SUCCESS", true)
      }
    }
    (expected, actual) match {
      case (a: Float, b: Float) if a.isNaN && b.isNaN => true
      case (a: Double, b: Double) if a.isNaN && b.isNaN => true
      case (null, null) => true
      case (null, _) => false
      case (_, null) => false
      case (a: Array[_], b: Array[_]) =>
        a.length == b.length && a.zip(b).forall { case (l, r) => compare(l, r, epsilon) }
      case (a: Map[_, _], b: Map[_, _]) =>
        a.size == b.size && a.keys.forall { aKey =>
          b.keys.find(bKey => compare(aKey, bKey))
            .exists(bKey => compare(a(aKey), b(bKey), epsilon))
        }
      case (a: Iterable[_], b: Iterable[_]) =>
        a.size == b.size && a.zip(b).forall { case (l, r) => compare(l, r, epsilon) }
      case (a: Product, b: Product) =>
        compare(a.productIterator.toSeq, b.productIterator.toSeq, epsilon)
      case (a: Row, b: Row) =>
        compare(a.toSeq, b.toSeq, epsilon)
      // 0.0 == -0.0, turn float/double to bits before comparison, to distinguish 0.0 and -0.0.
      case (a: Double, b: Double) if epsilon <= 0 =>
        java.lang.Double.doubleToRawLongBits(a) == java.lang.Double.doubleToRawLongBits(b)
      case (a: Double, b: Double) if epsilon > 0 =>
        val ret = doublesAreEqualWithinPercentage(a, b)
        if (!ret._2) {
          System.err.println(ret._1 + " (double)")
        }
        ret._2
      case (a: Float, b: Float) if epsilon <= 0 =>
        java.lang.Float.floatToRawIntBits(a) == java.lang.Float.floatToRawIntBits(b)
      case (a: Float, b: Float) if epsilon > 0 =>
        val ret = doublesAreEqualWithinPercentage(a, b)
        if (!ret._2) {
          System.err.println(ret._1 + " (float)")
        }
        ret._2
      case (a, b) => a == b
    }
  }
}
/**
 * Mixin giving each suite a private temporary directory, created before the
 * suite starts and deleted recursively after it finishes.
 */
trait TmpFolderSuite extends BeforeAndAfterAll { self: FunSuite =>
  // Root temp directory for this suite; valid between beforeAll and afterAll.
  protected var tempDir: Path = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    tempDir = Files.createTempDirectory(getClass.getName)
  }

  override def afterAll(): Unit = {
    // Clean up before delegating so the directory is gone even if a later
    // afterAll hook throws.
    JavaUtils.deleteRecursively(tempDir.toFile)
    super.afterAll()
  }

  /** Creates a fresh sub-directory under [[tempDir]] with the given name prefix. */
  protected def createTmpFolder(prefix: String): Path =
    Files.createTempDirectory(tempDir, prefix)
}
/**
 * Holds a single local SparkSession shared by all GPU test suites, and resets
 * its runtime configuration between tests so suites do not leak settings into
 * each other.
 */
object SparkSessionHolder extends Logging {
  private var spark = createSparkSession()
  // Snapshot of the session's conf right after creation; used to restore state.
  private var origConf = spark.conf.getAll
  private var origConfKeys = origConf.keys.toSet

  // Applies each (key, value) pair, skipping keys that already hold the value.
  private def setAllConfs(confs: Array[(String, String)]): Unit = confs.foreach {
    case (key, value) if spark.conf.get(key, null) != value =>
      spark.conf.set(key, value)
    case _ => // No need to modify it
  }

  private def createSparkSession(): SparkSession = {
    TrampolineUtil.cleanupAnyExistingSession()
    // Timezone is fixed to UTC to allow timestamps to work by default
    TimeZone.setDefault(TimeZone.getTimeZone("UTC"))
    // Add Locale setting
    Locale.setDefault(Locale.US)
    val builder = SparkSession.builder()
      .master("local[1]")
      .config("spark.sql.adaptive.enabled", "false")
      .config("spark.rapids.sql.enabled", "false")
      .config("spark.rapids.sql.test.enabled", "false")
      .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
      .config("spark.rapids.memory.gpu.pooling.enabled", "false") // Disable RMM for unit tests.
      .appName("XGBoost4j-Spark-Gpu unit test")
    builder.getOrCreate()
  }

  // Rebuilds the session and refreshes the conf snapshot.
  private def reinitSession(): Unit = {
    spark = createSparkSession()
    origConf = spark.conf.getAll
    origConfKeys = origConf.keys.toSet
  }

  /** The shared session, recreated lazily if no session is currently active. */
  def sparkSession: SparkSession = {
    if (SparkSession.getActiveSession.isEmpty) {
      reinitSession()
    }
    spark
  }

  /** Restores the session conf to its post-creation snapshot, removing any added keys. */
  def resetSparkSessionConf(): Unit = {
    if (SparkSession.getActiveSession.isEmpty) {
      reinitSession()
    } else {
      setAllConfs(origConf.toArray)
      val currentKeys = spark.conf.getAll.keys.toSet
      val toRemove = currentKeys -- origConfKeys
      toRemove.foreach(spark.conf.unset)
    }
    logDebug(s"RESET CONF TO: ${spark.conf.getAll}")
  }

  /** Runs `f` against a session whose conf has been reset and then overlaid with `conf`. */
  def withSparkSession[U](conf: SparkConf, f: SparkSession => U): U = {
    // Parenthesized call: this method has side effects (Scala convention).
    resetSparkSessionConf()
    logDebug(s"SETTING CONF: ${conf.getAll.toMap}")
    setAllConfs(conf.getAll)
    logDebug(s"RUN WITH CONF: ${spark.conf.getAll}\n")
    f(spark)
  }
}

View File

@@ -0,0 +1,226 @@
/*
Copyright (c) 2021 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ml.dmlc.xgboost4j.scala.rapids.spark
import java.io.File
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{FloatType, StructField, StructType}
/**
 * GPU tests for XGBoostClassifier: transform determinism, weight handling, and
 * cross-device (GPU <-> CPU) model persistence and transform compatibility.
 */
class GpuXGBoostClassifierSuite extends GpuTestSuite {

  // The demo data lives at a different relative depth depending on whether the
  // working directory is the module or the jvm-packages root; probe the deeper
  // path first and fall back. (Single literal per candidate — no duplication.)
  private val dataPath = {
    val fromModuleDir = "../../demo/data/veterans_lung_cancer.csv"
    if (new File(fromModuleDir).isFile) fromModuleDir
    else "../demo/data/veterans_lung_cancer.csv"
  }

  val labelName = "label_col"
  // 12 float features plus a float label, matching the demo CSV layout.
  val schema = StructType(Seq(
    StructField("f1", FloatType), StructField("f2", FloatType), StructField("f3", FloatType),
    StructField("f4", FloatType), StructField("f5", FloatType), StructField("f6", FloatType),
    StructField("f7", FloatType), StructField("f8", FloatType), StructField("f9", FloatType),
    StructField("f10", FloatType), StructField("f11", FloatType), StructField("f12", FloatType),
    StructField(labelName, FloatType)
  ))
  val featureNames = schema.fieldNames.filter(s => !s.equals(labelName)).toSeq

  test("The transform result should be same for several runs on same model") {
    withGpuSparkSession(enableCsvConf()) { spark =>
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
        "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
        "features_cols" -> featureNames, "label_col" -> labelName)
      val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
      // Get a model
      val model = new XGBoostClassifier(xgbParam)
        .fit(originalDf)
      val left = model.transform(testDf).collect()
      val right = model.transform(testDf).collect()
      // The left should be same with right
      assert(compareResults(true, 0.000001, left, right))
    }
  }

  test("use weight") {
    withGpuSparkSession(enableCsvConf()) { spark =>
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
        "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
        "features_cols" -> featureNames, "label_col" -> labelName)
      val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
      // Heavily down-weight rows with odd f1 so the weighted model must differ.
      val getWeightFromF1 = udf({ f1: Float => if (f1.toInt % 2 == 0) 1.0f else 0.001f })
      val dfWithWeight = originalDf.withColumn("weight", getWeightFromF1(col("f1")))
      val model = new XGBoostClassifier(xgbParam)
        .fit(originalDf)
      val model2 = new XGBoostClassifier(xgbParam)
        .setWeightCol("weight")
        .fit(dfWithWeight)
      val left = model.transform(testDf).collect()
      val right = model2.transform(testDf).collect()
      // left should be different with right
      assert(!compareResults(true, 0.000001, left, right))
    }
  }

  test("Save model and transform GPU dataset") {
    // Train a model on GPU
    val (gpuModel, testDf) = withGpuSparkSession(enableCsvConf()) { spark =>
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
        "num_round" -> 10, "num_workers" -> 1)
      val Array(rawInput, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
      val classifier = new XGBoostClassifier(xgbParam)
        .setFeaturesCols(featureNames)
        .setLabelCol(labelName)
        .setTreeMethod("gpu_hist")
      (classifier.fit(rawInput), testDf)
    }
    // Round-trip the model through disk and check both copies transform identically.
    val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath
    gpuModel.write.overwrite().save(xgbrModel)
    val gpuModelFromFile = XGBoostClassificationModel.load(xgbrModel)
    // transform on GPU
    withGpuSparkSession() { spark =>
      val left = gpuModel
        .transform(testDf)
        .select(labelName, "rawPrediction", "probability", "prediction")
        .collect()
      val right = gpuModelFromFile
        .transform(testDf)
        .select(labelName, "rawPrediction", "probability", "prediction")
        .collect()
      assert(compareResults(true, 0.000001, left, right))
    }
  }

  test("Model trained on CPU can transform GPU dataset") {
    // Train a model on CPU
    val cpuModel = withCpuSparkSession() { spark =>
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
        "num_round" -> 10, "num_workers" -> 1)
      val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
      val vectorAssembler = new VectorAssembler()
        .setHandleInvalid("keep")
        .setInputCols(featureNames.toArray)
        .setOutputCol("features")
      val trainingDf = vectorAssembler.transform(rawInput).select("features", labelName)
      val classifier = new XGBoostClassifier(xgbParam)
        .setFeaturesCol("features")
        .setLabelCol(labelName)
        .setTreeMethod("auto")
      classifier.fit(trainingDf)
    }
    val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath
    cpuModel.write.overwrite().save(xgbrModel)
    val cpuModelFromFile = XGBoostClassificationModel.load(xgbrModel)
    // transform on GPU
    withGpuSparkSession() { spark =>
      val Array(_, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
      // Since CPU model does not know the information about the features cols that GPU transform
      // pipeline requires. End user needs to setFeaturesCols in the model manually
      val thrown = intercept[IllegalArgumentException](cpuModel
        .transform(testDf)
        .collect())
      assert(thrown.getMessage.contains("Gpu transform requires features columns. " +
        "please refer to setFeaturesCols"))
      val left = cpuModel
        .setFeaturesCols(featureNames)
        .transform(testDf)
        .collect()
      val right = cpuModelFromFile
        .setFeaturesCols(featureNames)
        .transform(testDf)
        .collect()
      assert(compareResults(true, 0.000001, left, right))
    }
  }

  test("Model trained on GPU can transform CPU dataset") {
    // Train a model on GPU
    val gpuModel = withGpuSparkSession(enableCsvConf()) { spark =>
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
        "num_round" -> 10, "num_workers" -> 1)
      val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
      val classifier = new XGBoostClassifier(xgbParam)
        .setFeaturesCols(featureNames)
        .setLabelCol(labelName)
        .setTreeMethod("gpu_hist")
      classifier.fit(rawInput)
    }
    val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath
    gpuModel.write.overwrite().save(xgbrModel)
    val gpuModelFromFile = XGBoostClassificationModel.load(xgbrModel)
    // transform on CPU
    withCpuSparkSession() { spark =>
      val Array(_, rawInput) = spark.read.option("header", "true").schema(schema)
        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
      val featureColName = "feature_col"
      val vectorAssembler = new VectorAssembler()
        .setHandleInvalid("keep")
        .setInputCols(featureNames.toArray)
        .setOutputCol(featureColName)
      val testDf = vectorAssembler.transform(rawInput).select(featureColName, labelName)
      // Since GPU model does not know the information about the features col name that CPU
      // transform pipeline requires. End user needs to setFeaturesCol in the model manually
      val thrown = intercept[IllegalArgumentException](
        gpuModel
          .transform(testDf)
          .collect())
      assert(thrown.getMessage.contains("features does not exist"))
      val left = gpuModel
        .setFeaturesCol(featureColName)
        .transform(testDf)
        .select(labelName, "rawPrediction", "probability", "prediction")
        .collect()
      val right = gpuModelFromFile
        .setFeaturesCol(featureColName)
        .transform(testDf)
        .select(labelName, "rawPrediction", "probability", "prediction")
        .collect()
      assert(compareResults(true, 0.000001, left, right))
    }
  }
}

View File

@@ -0,0 +1,182 @@
/*
Copyright (c) 2021 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ml.dmlc.xgboost4j.scala.rapids.spark
import java.io.File
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType
/**
 * General GPU training tests for XGBoostClassifier: parameter plumbing
 * (map vs. setter style), input validation errors, evaluation sets, and
 * estimator/model persistence. Uses a small in-memory dataset so no external
 * files are required.
 */
class GpuXGBoostGeneralSuite extends GpuTestSuite {
  private val labelName = "label_col"
  private val weightName = "weight_col"
  private val baseMarginName = "margin_col"
  private val featureNames = Seq("f1", "f2", "f3")
  private val allColumnNames = featureNames :+ weightName :+ baseMarginName :+ labelName
  // Hand-written 3-class dataset; tuple order matches allColumnNames.
  private val trainingData = Seq(
    // f1, f2, f3, weight, margin, label
    (1.0f, 2.0f, 3.0f, 1.0f, 0.5f, 0),
    (2.0f, 3.0f, 4.0f, 2.0f, 0.6f, 0),
    (1.2f, 2.1f, 3.1f, 1.1f, 0.51f, 0),
    (2.3f, 3.1f, 4.1f, 2.1f, 0.61f, 0),
    (3.0f, 4.0f, 5.0f, 1.5f, 0.3f, 1),
    (4.0f, 5.0f, 6.0f, 2.5f, 0.4f, 1),
    (3.1f, 4.1f, 5.1f, 1.6f, 0.4f, 1),
    (4.1f, 5.1f, 6.1f, 2.6f, 0.5f, 1),
    (5.0f, 6.0f, 7.0f, 1.0f, 0.2f, 2),
    (6.0f, 7.0f, 8.0f, 1.3f, 0.6f, 2),
    (5.1f, 6.1f, 7.1f, 1.2f, 0.1f, 2),
    (6.1f, 7.1f, 8.1f, 1.4f, 0.7f, 2),
    (6.2f, 7.2f, 8.2f, 1.5f, 0.8f, 2))

  test("MLlib way setting features_cols should work") {
    withGpuSparkSession() { spark =>
      import spark.implicits._
      val trainingDf = trainingData.toDF(allColumnNames: _*)
      // features_cols / label_col are passed through the parameter map rather
      // than via setters; training must succeed either way.
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist",
        "features_cols" -> featureNames, "label_col" -> labelName)
      new XGBoostClassifier(xgbParam)
        .fit(trainingDf)
    }
  }

  test("disorder feature columns should work") {
    withGpuSparkSession() { spark =>
      import spark.implicits._
      var trainingDf = trainingData.toDF(allColumnNames: _*)
      // Shuffle the column order so it no longer matches featureNames; training
      // must still resolve features by name, not by position.
      trainingDf = trainingDf.select(labelName, "f2", weightName, "f3", baseMarginName, "f1")
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
      new XGBoostClassifier(xgbParam)
        .setFeaturesCols(featureNames)
        .setLabelCol(labelName)
        .fit(trainingDf)
    }
  }

  test("Throw exception when feature/label columns are not numeric type") {
    withGpuSparkSession() { spark =>
      import spark.implicits._
      val originalDf = trainingData.toDF(allColumnNames: _*)
      // Case 1: a feature column cast to string must be rejected.
      var trainingDf = originalDf.withColumn("f2", col("f2").cast(StringType))
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
      val thrown1 = intercept[IllegalArgumentException] {
        new XGBoostClassifier(xgbParam)
          .setFeaturesCols(featureNames)
          .setLabelCol(labelName)
          .fit(trainingDf)
      }
      assert(thrown1.getMessage.contains("Column f2 must be of NumericType but found: string."))
      // Case 2: a label column cast to string must also be rejected.
      trainingDf = originalDf.withColumn(labelName, col(labelName).cast(StringType))
      val thrown2 = intercept[IllegalArgumentException] {
        new XGBoostClassifier(xgbParam)
          .setFeaturesCols(featureNames)
          .setLabelCol(labelName)
          .fit(trainingDf)
      }
      assert(thrown2.getMessage.contains(
        s"Column $labelName must be of NumericType but found: string."))
    }
  }

  test("Throw exception when features_cols or label_col is not set") {
    withGpuSparkSession() { spark =>
      import spark.implicits._
      val trainingDf = trainingData.toDF(allColumnNames: _*)
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
      // Missing features_cols.
      val thrown = intercept[IllegalArgumentException] {
        new XGBoostClassifier(xgbParam)
          .setLabelCol(labelName)
          .fit(trainingDf)
      }
      assert(thrown.getMessage.contains("Gpu train requires features columns."))
      // Missing label_col: the default "label" column does not exist in the data.
      val thrown1 = intercept[IllegalArgumentException] {
        new XGBoostClassifier(xgbParam)
          .setFeaturesCols(featureNames)
          .fit(trainingDf)
      }
      assert(thrown1.getMessage.contains("label does not exist."))
    }
  }

  test("Throw exception when tree method is not set to gpu_hist") {
    withGpuSparkSession() { spark =>
      import spark.implicits._
      val trainingDf = trainingData.toDF(allColumnNames: _*)
      // CPU "hist" on the GPU pipeline must be rejected up front.
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "hist")
      val thrown = intercept[IllegalArgumentException] {
        new XGBoostClassifier(xgbParam)
          .setFeaturesCols(featureNames)
          .setLabelCol(labelName)
          .fit(trainingDf)
      }
      assert(thrown.getMessage.contains("GPU train requires tree_method set to gpu_hist"))
    }
  }

  test("Train with eval") {
    withGpuSparkSession() { spark =>
      import spark.implicits._
      val Array(trainingDf, eval1, eval2) = trainingData.toDF(allColumnNames: _*)
        .randomSplit(Array(0.6, 0.2, 0.2), seed = 1)
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
      val model1 = new XGBoostClassifier(xgbParam)
        .setFeaturesCols(featureNames)
        .setLabelCol(labelName)
        .setEvalSets(Map("eval1" -> eval1, "eval2" -> eval2))
        .fit(trainingDf)
      // One history per eval set, one metric value per round (5 rounds), and the
      // eval histories must differ from the training history.
      assert(model1.summary.validationObjectiveHistory.length === 2)
      assert(model1.summary.validationObjectiveHistory.map(_._1).toSet === Set("eval1", "eval2"))
      assert(model1.summary.validationObjectiveHistory(0)._2.length === 5)
      assert(model1.summary.validationObjectiveHistory(1)._2.length === 5)
      assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(0))
      assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(1))
    }
  }

  test("test persistence of XGBoostClassifier and XGBoostClassificationModel") {
    val xgbcPath = new File(tempDir.toFile, "xgbc").getPath
    withGpuSparkSession() { spark =>
      import spark.implicits._
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
        "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist",
        "features_cols" -> featureNames, "label_col" -> labelName)
      val xgbc = new XGBoostClassifier(xgbParam)
      xgbc.write.overwrite().save(xgbcPath)
      // Reload and verify every parameter survives the save/load round trip.
      val paramMap2 = XGBoostClassifier.load(xgbcPath).MLlib2XGBoostParams
      xgbParam.foreach {
        case (k, v) => assert(v.toString == paramMap2(k).toString)
      }
    }
  }
}

View File

@@ -0,0 +1,239 @@
/*
Copyright (c) 2021 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ml.dmlc.xgboost4j.scala.rapids.spark
import java.io.File
import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType}
/**
 * GPU tests for XGBoostRegressor: transform determinism, weight handling,
 * cross-device (GPU <-> CPU) model persistence/transform, and ranking with a
 * group column. Data comes from the rank.train.csv test resource.
 */
class GpuXGBoostRegressorSuite extends GpuTestSuite {

  val labelName = "label_col"
  val groupName = "group_col"
  val schema = StructType(Seq(
    StructField(labelName, FloatType),
    StructField("f1", FloatType),
    StructField("f2", FloatType),
    StructField("f3", FloatType),
    StructField(groupName, IntegerType)))
  // Every column except label and group is a feature.
  val featureNames = schema.fieldNames.filter(s =>
    !(s.equals(labelName) || s.equals(groupName))).toSeq

  test("The transform result should be same for several runs on same model") {
    withGpuSparkSession(enableCsvConf()) { spark =>
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror",
        "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
        "features_cols" -> featureNames, "label_col" -> labelName)
      val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)
      // Get a model
      val model = new XGBoostRegressor(xgbParam)
        .fit(originalDf)
      val left = model.transform(testDf).collect()
      val right = model.transform(testDf).collect()
      // The left should be same with right
      assert(compareResults(true, 0.000001, left, right))
    }
  }

  test("use weight") {
    withGpuSparkSession(enableCsvConf()) { spark =>
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror",
        "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
        "features_cols" -> featureNames, "label_col" -> labelName)
      val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)
      // Heavily down-weight rows with odd f1 so the weighted model must differ.
      val getWeightFromF1 = udf({ f1: Float => if (f1.toInt % 2 == 0) 1.0f else 0.001f })
      val dfWithWeight = originalDf.withColumn("weight", getWeightFromF1(col("f1")))
      val model = new XGBoostRegressor(xgbParam)
        .fit(originalDf)
      val model2 = new XGBoostRegressor(xgbParam)
        .setWeightCol("weight")
        .fit(dfWithWeight)
      val left = model.transform(testDf).collect()
      val right = model2.transform(testDf).collect()
      // left should be different with right
      assert(!compareResults(true, 0.000001, left, right))
    }
  }

  test("Save model and transform GPU dataset") {
    // Train a model on GPU
    val (gpuModel, testDf) = withGpuSparkSession(enableCsvConf()) { spark =>
      // Use the regression objective, consistent with the rest of this suite.
      // (Previously "binary:logistic" — a copy-paste from the classifier suite.)
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror",
        "num_round" -> 10, "num_workers" -> 1)
      val Array(rawInput, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)
      val regressor = new XGBoostRegressor(xgbParam)
        .setFeaturesCols(featureNames)
        .setLabelCol(labelName)
        .setTreeMethod("gpu_hist")
      (regressor.fit(rawInput), testDf)
    }
    // Round-trip the model through disk and check both copies transform identically.
    val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath
    gpuModel.write.overwrite().save(xgbrModel)
    val gpuModelFromFile = XGBoostRegressionModel.load(xgbrModel)
    // transform on GPU
    withGpuSparkSession() { spark =>
      val left = gpuModel
        .transform(testDf)
        .select(labelName, "prediction")
        .collect()
      val right = gpuModelFromFile
        .transform(testDf)
        .select(labelName, "prediction")
        .collect()
      assert(compareResults(true, 0.000001, left, right))
    }
  }

  test("Model trained on CPU can transform GPU dataset") {
    // Train a model on CPU
    val cpuModel = withCpuSparkSession() { spark =>
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror",
        "num_round" -> 10, "num_workers" -> 1)
      val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
        .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)
      val vectorAssembler = new VectorAssembler()
        .setHandleInvalid("keep")
        .setInputCols(featureNames.toArray)
        .setOutputCol("features")
      val trainingDf = vectorAssembler.transform(rawInput).select("features", labelName)
      val regressor = new XGBoostRegressor(xgbParam)
        .setFeaturesCol("features")
        .setLabelCol(labelName)
        .setTreeMethod("auto")
      regressor.fit(trainingDf)
    }
    val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath
    cpuModel.write.overwrite().save(xgbrModel)
    val cpuModelFromFile = XGBoostRegressionModel.load(xgbrModel)
    // transform on GPU
    withGpuSparkSession() { spark =>
      val Array(_, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)
      // Since CPU model does not know the information about the features cols that GPU transform
      // pipeline requires. End user needs to setFeaturesCols in the model manually
      val thrown = intercept[IllegalArgumentException](cpuModel
        .transform(testDf)
        .collect())
      assert(thrown.getMessage.contains("Gpu transform requires features columns. " +
        "please refer to setFeaturesCols"))
      val left = cpuModel
        .setFeaturesCols(featureNames)
        .transform(testDf)
        .collect()
      val right = cpuModelFromFile
        .setFeaturesCols(featureNames)
        .transform(testDf)
        .collect()
      assert(compareResults(true, 0.000001, left, right))
    }
  }

  test("Model trained on GPU can transform CPU dataset") {
    // Train a model on GPU
    val gpuModel = withGpuSparkSession(enableCsvConf()) { spark =>
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror",
        "num_round" -> 10, "num_workers" -> 1)
      val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
        .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)
      val regressor = new XGBoostRegressor(xgbParam)
        .setFeaturesCols(featureNames)
        .setLabelCol(labelName)
        .setTreeMethod("gpu_hist")
      regressor.fit(rawInput)
    }
    val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath
    gpuModel.write.overwrite().save(xgbrModel)
    val gpuModelFromFile = XGBoostRegressionModel.load(xgbrModel)
    // transform on CPU
    withCpuSparkSession() { spark =>
      val Array(_, rawInput) = spark.read.option("header", "true").schema(schema)
        .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)
      val featureColName = "feature_col"
      val vectorAssembler = new VectorAssembler()
        .setHandleInvalid("keep")
        .setInputCols(featureNames.toArray)
        .setOutputCol(featureColName)
      val testDf = vectorAssembler.transform(rawInput).select(featureColName, labelName)
      // Since GPU model does not know the information about the features col name that CPU
      // transform pipeline requires. End user needs to setFeaturesCol in the model manually
      val thrown = intercept[IllegalArgumentException](
        gpuModel
          .transform(testDf)
          .collect())
      assert(thrown.getMessage.contains("features does not exist"))
      val left = gpuModel
        .setFeaturesCol(featureColName)
        .transform(testDf)
        .select(labelName, "prediction")
        .collect()
      val right = gpuModelFromFile
        .setFeaturesCol(featureColName)
        .transform(testDf)
        .select(labelName, "prediction")
        .collect()
      assert(compareResults(true, 0.000001, left, right))
    }
  }

  test("Ranking: train with Group") {
    withGpuSparkSession(enableCsvConf()) { spark =>
      val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "rank:pairwise",
        "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
        "features_cols" -> featureNames, "label_col" -> labelName)
      val Array(trainingDf, testDf) = spark.read.option("header", "true").schema(schema)
        .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1)
      val model = new XGBoostRegressor(xgbParam)
        .setGroupCol(groupName)
        .fit(trainingDf)
      // One prediction per test row.
      val ret = model.transform(testDf).collect()
      assert(testDf.count() === ret.length)
    }
  }
}