[jvm-packages] Support Ranker (#10823)

This commit is contained in:
Bobby Wang
2024-09-22 02:02:15 +08:00
committed by GitHub
parent d7599e095b
commit 19b55b300b
6 changed files with 558 additions and 10 deletions

View File

@@ -93,7 +93,8 @@ class GpuXGBoostPlugin extends XGBoostPlugin {
selectedCols.append(col)
}
val input = dataset.select(selectedCols.toArray: _*)
estimator.repartitionIfNeeded(input)
val repartitioned = estimator.repartitionIfNeeded(input)
estimator.sortPartitionIfNeeded(repartitioned)
}
// visible for testing

View File

@@ -16,14 +16,14 @@
package ml.dmlc.xgboost4j.scala.spark
import ai.rapids.cudf.Table
import ai.rapids.cudf.{OrderByArg, Table}
import ml.dmlc.xgboost4j.java.CudfColumnBatch
import ml.dmlc.xgboost4j.scala.{DMatrix, QuantileDMatrix, XGBoost => ScalaXGBoost}
import ml.dmlc.xgboost4j.scala.rapids.spark.GpuTestSuite
import ml.dmlc.xgboost4j.scala.rapids.spark.SparkSessionHolder.withSparkSession
import ml.dmlc.xgboost4j.scala.spark.Utils.withResource
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.SparkConf
import java.io.File
@@ -94,7 +94,9 @@ class GpuXGBoostPluginSuite extends GpuTestSuite {
}
// spark.rapids.sql.enabled is not set explicitly, default to true
withSparkSession(new SparkConf(), spark => {checkIsEnabled(spark, true)})
withSparkSession(new SparkConf(), spark => {
checkIsEnabled(spark, true)
})
// set spark.rapids.sql.enabled to false
withCpuSparkSession() { spark =>
@@ -503,6 +505,109 @@ class GpuXGBoostPluginSuite extends GpuTestSuite {
}
}
test("The group col should be sorted in each partition") {
  withGpuSparkSession() { spark =>
    import spark.implicits._
    // Synthetic ranking data; the raw frame carries label/weight/group plus three features.
    val trainDf = Ranking.train.toDF("label", "weight", "group", "c1", "c2", "c3")
    val params: Map[String, Any] = Map(
      "device" -> "cuda",
      "objective" -> "rank:ndcg"
    )
    val ranker = new XGBoostRanker(params)
      .setFeaturesCol(Array("c1", "c2", "c3"))
      .setLabelCol("label")
      .setNumWorkers(1)
      .setNumRound(1)
      .setGroupCol("group")
      .setDevice("cuda")
    // The GPU plugin's preprocess step is expected to sort rows by the group
    // column within every partition before data is handed to XGBoost.
    val prepared = ranker.getPlugin.get.asInstanceOf[GpuXGBoostPlugin].preprocess(ranker, trainDf)
    prepared.rdd.foreachPartition { rows =>
      // Fold across the partition carrying the last-seen group id and assert it
      // never decreases. Column index 1 is assumed to hold the group id in the
      // processed frame — TODO confirm against preprocess()'s column ordering.
      rows.foldLeft(Int.MinValue) { (previous, row) =>
        val current = row.asInstanceOf[Row].getAs[Int](1)
        assert(previous <= current)
        current
      }
      ()
    }
  }
}
// End-to-end parity check: training/predicting through the Spark XGBoostRanker
// must produce the same leaves, contributions, and predictions as driving the
// low-level xgboost4j API directly on the same (group-sorted) data.
test("Ranker: XGBoost-Spark should match xgboost4j") {
  withGpuSparkSession() { spark =>
    import spark.implicits._
    // Persist the generated train/test ranking data as parquet so both the
    // Spark reader and cuDF's Table.readParquet consume identical bytes.
    val trainPath = writeFile(Ranking.train.toDF("label", "weight", "group", "c1", "c2", "c3"))
    val testPath = writeFile(Ranking.test.toDF("label", "weight", "group", "c1", "c2", "c3"))
    val df = spark.read.parquet(trainPath)
    val testdf = spark.read.parquet(testPath)
    val features = Array("c1", "c2", "c3")
    val featuresIndices = features.map(df.schema.fieldIndex)
    val label = "label"
    val group = "group"
    val numRound = 100
    val xgboostParams: Map[String, Any] = Map(
      "device" -> "cuda",
      "objective" -> "rank:ndcg"
    )
    // Leaf/contrib prediction columns are enabled so all three outputs can be
    // compared against the xgboost4j model below.
    val ranker = new XGBoostRanker(xgboostParams)
      .setFeaturesCol(features)
      .setLabelCol(label)
      .setNumRound(numRound)
      .setLeafPredictionCol("leaf")
      .setContribPredictionCol("contrib")
      .setGroupCol(group)
      .setDevice("cuda")
    // Reference model trained through the raw xgboost4j API. The cuDF table is
    // sorted by the group column first, mirroring what the Spark plugin's
    // per-partition sort produces; withResource frees the native table batch.
    val xgb4jModel = withResource(new GpuColumnBatch(
      Table.readParquet(new File(trainPath)
      ).orderBy(OrderByArg.asc(df.schema.fieldIndex(group))))) { batch =>
      // CudfColumnBatch args: features, label, weight, base margin, group.
      val cb = new CudfColumnBatch(batch.select(featuresIndices),
        batch.select(df.schema.fieldIndex(label)), null, null,
        batch.select(df.schema.fieldIndex(group)))
      val qdm = new QuantileDMatrix(Seq(cb).iterator, ranker.getMissing,
        ranker.getMaxBins, ranker.getNthread)
      ScalaXGBoost.train(qdm, xgboostParams, numRound)
    }
    // Reference predictions on the test set (features only — no label/group
    // needed at inference time).
    val (xgb4jLeaf, xgb4jContrib, xgb4jPred) = withResource(new GpuColumnBatch(
      Table.readParquet(new File(testPath)))) { batch =>
      val cb = new CudfColumnBatch(batch.select(featuresIndices), null, null, null, null
      )
      val qdm = new DMatrix(cb, ranker.getMissing, ranker.getNthread)
      (xgb4jModel.predictLeaf(qdm), xgb4jModel.predictContrib(qdm),
        xgb4jModel.predict(qdm))
    }
    // Spark-side fit + transform on the same data.
    val rows = ranker.fit(df).transform(testdf).collect()
    // Check Leaf
    val xgbSparkLeaf = rows.map(row => row.getAs[DenseVector]("leaf").toArray.map(_.toFloat))
    checkEqual(xgb4jLeaf, xgbSparkLeaf)
    // Check contrib
    val xgbSparkContrib = rows.map(row =>
      row.getAs[DenseVector]("contrib").toArray.map(_.toFloat))
    checkEqual(xgb4jContrib, xgbSparkContrib)
    // Check prediction
    val xgbSparkPred = rows.map(row =>
      Array(row.getAs[Double]("prediction").toFloat))
    checkEqual(xgb4jPred, xgbSparkPred)
  }
}
def writeFile(df: Dataset[_]): String = {
def listFiles(directory: String): Array[String] = {
val dir = new File(directory)

View File

@@ -81,6 +81,6 @@ object Regression extends TrainTestData {
}
object Ranking extends TrainTestData {
  // NOTE(review): the diff view left both the pre- and post-commit definitions
  // of these vals in place, which would not compile (duplicate `train`/`test`).
  // Kept the post-commit 4-argument form added by this commit.
  // Args appear to be (rows, features, maxGroup, seed) — fixed seeds keep the
  // generated ranking data reproducible; confirm arg meaning against
  // generateRankDataset's signature.
  val train = generateRankDataset(300, 10, 12, 555)
  val test = generateRankDataset(150, 10, 12, 556)
}