[jvm-packages] Support Ranker (#10823)
This commit is contained in:
@@ -93,7 +93,8 @@ class GpuXGBoostPlugin extends XGBoostPlugin {
|
||||
selectedCols.append(col)
|
||||
}
|
||||
val input = dataset.select(selectedCols.toArray: _*)
|
||||
estimator.repartitionIfNeeded(input)
|
||||
val repartitioned = estimator.repartitionIfNeeded(input)
|
||||
estimator.sortPartitionIfNeeded(repartitioned)
|
||||
}
|
||||
|
||||
// visible for testing
|
||||
|
||||
@@ -16,14 +16,14 @@
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ai.rapids.cudf.Table
|
||||
import ai.rapids.cudf.{OrderByArg, Table}
|
||||
import ml.dmlc.xgboost4j.java.CudfColumnBatch
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, QuantileDMatrix, XGBoost => ScalaXGBoost}
|
||||
import ml.dmlc.xgboost4j.scala.rapids.spark.GpuTestSuite
|
||||
import ml.dmlc.xgboost4j.scala.rapids.spark.SparkSessionHolder.withSparkSession
|
||||
import ml.dmlc.xgboost4j.scala.spark.Utils.withResource
|
||||
import org.apache.spark.ml.linalg.DenseVector
|
||||
import org.apache.spark.sql.{Dataset, SparkSession}
|
||||
import org.apache.spark.sql.{Dataset, Row, SparkSession}
|
||||
import org.apache.spark.SparkConf
|
||||
|
||||
import java.io.File
|
||||
@@ -94,7 +94,9 @@ class GpuXGBoostPluginSuite extends GpuTestSuite {
|
||||
}
|
||||
|
||||
// spark.rapids.sql.enabled is not set explicitly, so it defaults to true
|
||||
withSparkSession(new SparkConf(), spark => {checkIsEnabled(spark, true)})
|
||||
withSparkSession(new SparkConf(), spark => {
|
||||
checkIsEnabled(spark, true)
|
||||
})
|
||||
|
||||
// set spark.rapids.sql.enabled to false
|
||||
withCpuSparkSession() { spark =>
|
||||
@@ -503,6 +505,109 @@ class GpuXGBoostPluginSuite extends GpuTestSuite {
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that, after the GPU plugin's preprocess step, the group values seen
// inside every partition are in non-decreasing order.
test("The group col should be sorted in each partition") {
  withGpuSparkSession() { spark =>
    import spark.implicits._

    val labelName = "label"
    val groupName = "group"
    val featureNames = Array("c1", "c2", "c3")
    val params: Map[String, Any] = Map(
      "device" -> "cuda",
      "objective" -> "rank:ndcg"
    )

    val trainDf = Ranking.train.toDF("label", "weight", "group", "c1", "c2", "c3")

    // Single worker, single round: only the preprocessing output is inspected.
    val ranker = new XGBoostRanker(params)
      .setFeaturesCol(featureNames)
      .setLabelCol(labelName)
      .setNumWorkers(1)
      .setNumRound(1)
      .setGroupCol(groupName)
      .setDevice("cuda")

    val gpuPlugin = ranker.getPlugin.get.asInstanceOf[GpuXGBoostPlugin]
    val processedDf = gpuPlugin.preprocess(ranker, trainDf)

    processedDf.rdd.foreachPartition { rows =>
      // Start below any possible group id so the first row always passes.
      var previous = Int.MinValue
      rows.foreach { entry =>
        // Column index 1 is read as the group value
        // (presumably the group column's position after preprocess - TODO confirm).
        val current = entry.asInstanceOf[Row].getAs[Int](1)
        assert(previous <= current)
        previous = current
      }
    }
  }
}
|
||||
|
||||
// Trains a ranking model twice on the same parquet data - once directly through
// xgboost4j on a cuDF table sorted by group, once through XGBoostRanker on Spark -
// and checks that leaf, contribution and plain predictions agree on the test set.
test("Ranker: XGBoost-Spark should match xgboost4j") {
  withGpuSparkSession() { spark =>
    import spark.implicits._

    val trainPath = writeFile(Ranking.train.toDF("label", "weight", "group", "c1", "c2", "c3"))
    val testPath = writeFile(Ranking.test.toDF("label", "weight", "group", "c1", "c2", "c3"))

    val trainDf = spark.read.parquet(trainPath)
    val testDf = spark.read.parquet(testPath)

    val featureNames = Array("c1", "c2", "c3")
    val labelName = "label"
    val groupName = "group"

    // Column positions are resolved once against the training schema and reused
    // for both the training and the test table.
    val trainSchema = trainDf.schema
    val featureIdx = featureNames.map(trainSchema.fieldIndex)
    val labelIdx = trainSchema.fieldIndex(labelName)
    val groupIdx = trainSchema.fieldIndex(groupName)

    val rounds = 100
    val params: Map[String, Any] = Map(
      "device" -> "cuda",
      "objective" -> "rank:ndcg"
    )

    val ranker = new XGBoostRanker(params)
      .setFeaturesCol(featureNames)
      .setLabelCol(labelName)
      .setNumRound(rounds)
      .setLeafPredictionCol("leaf")
      .setContribPredictionCol("contrib")
      .setGroupCol(groupName)
      .setDevice("cuda")

    // Reference model: the training table is ordered by the group column before
    // it is handed to xgboost4j.
    val sortedTrainTable =
      Table.readParquet(new File(trainPath)).orderBy(OrderByArg.asc(groupIdx))
    val xgb4jModel = withResource(new GpuColumnBatch(sortedTrainTable)) { batch =>
      val trainBatch = new CudfColumnBatch(
        batch.select(featureIdx),
        batch.select(labelIdx),
        null,
        null,
        batch.select(groupIdx))
      val trainMatrix = new QuantileDMatrix(
        Seq(trainBatch).iterator,
        ranker.getMissing,
        ranker.getMaxBins,
        ranker.getNthread)
      ScalaXGBoost.train(trainMatrix, params, rounds)
    }

    // Reference predictions: only the feature columns are passed for inference.
    val testTable = Table.readParquet(new File(testPath))
    val (xgb4jLeaf, xgb4jContrib, xgb4jPred) =
      withResource(new GpuColumnBatch(testTable)) { batch =>
        val testBatch = new CudfColumnBatch(batch.select(featureIdx), null, null, null, null)
        val testMatrix = new DMatrix(testBatch, ranker.getMissing, ranker.getNthread)
        (
          xgb4jModel.predictLeaf(testMatrix),
          xgb4jModel.predictContrib(testMatrix),
          xgb4jModel.predict(testMatrix)
        )
      }

    val rows = ranker.fit(trainDf).transform(testDf).collect()

    // Reads a DenseVector output column as one float array per row.
    def vectorColumn(name: String): Array[Array[Float]] =
      rows.map(_.getAs[DenseVector](name).toArray.map(_.toFloat))

    // Check Leaf
    val xgbSparkLeaf = vectorColumn("leaf")
    checkEqual(xgb4jLeaf, xgbSparkLeaf)

    // Check contrib
    val xgbSparkContrib = vectorColumn("contrib")
    checkEqual(xgb4jContrib, xgbSparkContrib)

    // Check prediction
    val xgbSparkPred = rows.map { row =>
      Array(row.getAs[Double]("prediction").toFloat)
    }
    checkEqual(xgb4jPred, xgbSparkPred)
  }
}
|
||||
|
||||
def writeFile(df: Dataset[_]): String = {
|
||||
def listFiles(directory: String): Array[String] = {
|
||||
val dir = new File(directory)
|
||||
|
||||
@@ -81,6 +81,6 @@ object Regression extends TrainTestData {
|
||||
}
|
||||
|
||||
/**
 * Train and test fixtures for the ranking tests, both built with
 * `generateRankDataset`.
 *
 * Only the four-argument calls are kept: the earlier three-argument definitions
 * of `train` and `test` duplicated these names, which does not compile.
 * The meaning of each argument is defined by `generateRankDataset`, which is
 * not visible from here.
 */
object Ranking extends TrainTestData {
  val train = generateRankDataset(300, 10, 12, 555)
  val test = generateRankDataset(150, 10, 12, 556)
}
|
||||
|
||||
Reference in New Issue
Block a user