[jvm-packages] Fix #3489: Spark repartitionForData can potentially shuffle all data and lose ordering required for ranking objectives (#3654)
This commit is contained in:
@@ -173,7 +173,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
val training2 = training1.withColumn("margin", functions.rand())
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "test_train_split" -> "0.5",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "1.0",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
|
||||
@@ -19,6 +19,7 @@ package ml.dmlc.xgboost4j.scala.spark
|
||||
import java.nio.file.Files
|
||||
import java.util.concurrent.LinkedBlockingDeque
|
||||
import ml.dmlc.xgboost4j.java.Rabit
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
import ml.dmlc.xgboost4j.scala.DMatrix
|
||||
import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
|
||||
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
|
||||
@@ -71,18 +72,16 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
assert(collectedAllReduceResults.poll().sameElements(maxVec))
|
||||
}
|
||||
|
||||
test("build RDD containing boosters with the specified worker number") {
|
||||
test("distributed training with the specified worker number") {
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
val partitionedRDD = XGBoost.repartitionForTraining(trainingRDD, 2)
|
||||
val boosterRDD = XGBoost.buildDistributedBoosters(
|
||||
partitionedRDD,
|
||||
val (booster, metrics) = XGBoost.trainDistributed(
|
||||
trainingRDD,
|
||||
List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap,
|
||||
new java.util.HashMap[String, String](),
|
||||
round = 5, eval = null, obj = null, useExternalMemory = true,
|
||||
missing = Float.NaN, prevBooster = null)
|
||||
val boosterCount = boosterRDD.count()
|
||||
assert(boosterCount === 2)
|
||||
round = 5, nWorkers = numWorkers, eval = null, obj = null, useExternalMemory = false,
|
||||
hasGroup = false, missing = Float.NaN)
|
||||
|
||||
assert(booster != null)
|
||||
}
|
||||
|
||||
test("training with external memory cache") {
|
||||
@@ -235,4 +234,37 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
assert(error(prevModel._booster) > error(nextModel._booster))
|
||||
assert(error(nextModel._booster) < 0.1)
|
||||
}
|
||||
|
||||
test("repartitionForTrainingGroup with group data") {
  // Feed the input with every partition count from 1 to 20 so that group
  // boundaries fall at different places relative to partition boundaries.
  (1 to 20).foreach { split =>
    val inputRDD = sc.parallelize(Ranking.train, split)
    val groupedRDD = XGBoost.repartitionForTrainingGroup(inputRDD, 4)
    val collectedGroups: Array[Array[XGBLabeledPoint]] = groupedRDD.collect()

    // Check the number of groups: Ranking.train has 20 groups.
    assert(collectedGroups.length == 20)

    // Order the groups by the group id of their first point, then compare
    // every point with the original data, position by position.
    val orderedPoints = collectedGroups.sortBy(g => g(0).group).flatten
    assert(orderedPoints.length == Ranking.train.size)
    (0 until Ranking.train.size).foreach { idx =>
      val actual = orderedPoints(idx)
      val expected = Ranking.train(idx)
      assert(actual.group == expected.group)
      assert(actual.label == expected.label)
      assert(actual.values.sameElements(expected.values))
    }
  }
}
|
||||
|
||||
test("distributed training with group data") {
  // Ranking data spread over two partitions, trained with hasGroup = true.
  val groupedTrainingRDD = sc.parallelize(Ranking.train, 2)

  // NOTE(review): the objective is "binary:logistic" even though group data is
  // supplied; presumably this test only checks that training completes --
  // confirm whether a ranking objective was intended.
  val trainingParams = Map(
    "eta" -> "1",
    "max_depth" -> "6",
    "silent" -> "1",
    "objective" -> "binary:logistic")

  // Only the booster is inspected; the second element of the result is ignored.
  val (trainedBooster, _) = XGBoost.trainDistributed(
    groupedTrainingRDD,
    trainingParams,
    round = 5,
    nWorkers = numWorkers,
    eval = null,
    obj = null,
    useExternalMemory = false,
    hasGroup = true,
    missing = Float.NaN)

  assert(trainedBooster != null)
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user