[jvm-packages] Fix #3489: Spark repartitionForData can potentially shuffle all data and lose ordering required for ranking objectives (#3654)

2018-10-03 08:43:55 -07:00
parent d594b11f35
commit efc4f85505
5 changed files with 274 additions and 109 deletions
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
@@ -173,7 +173,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
    val training2 = training1.withColumn("margin", functions.rand())
    val test = buildDataFrame(Classification.test)
    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
-      "objective" -> "binary:logistic", "test_train_split" -> "0.5",
+      "objective" -> "binary:logistic", "train_test_ratio" -> "1.0",
      "num_round" -> 5, "num_workers" -> numWorkers)

    val xgb = new XGBoostClassifier(paramMap)
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
@@ -19,6 +19,7 @@ package ml.dmlc.xgboost4j.scala.spark
 import java.nio.file.Files
 import java.util.concurrent.LinkedBlockingDeque
 import ml.dmlc.xgboost4j.java.Rabit
+import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
 import ml.dmlc.xgboost4j.scala.DMatrix
 import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
 import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
@@ -71,18 +72,16 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
    assert(collectedAllReduceResults.poll().sameElements(maxVec))
  }

-  test("build RDD containing boosters with the specified worker number") {
+  test("distributed training with the specified worker number") {
    val trainingRDD = sc.parallelize(Classification.train)
-    val partitionedRDD = XGBoost.repartitionForTraining(trainingRDD, 2)
-    val boosterRDD = XGBoost.buildDistributedBoosters(
-      partitionedRDD,
+    val (booster, metrics) = XGBoost.trainDistributed(
+      trainingRDD,
      List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
        "objective" -> "binary:logistic").toMap,
-      new java.util.HashMap[String, String](),
-      round = 5, eval = null, obj = null, useExternalMemory = true,
-      missing = Float.NaN, prevBooster = null)
-    val boosterCount = boosterRDD.count()
-    assert(boosterCount === 2)
+      round = 5, nWorkers = numWorkers, eval = null, obj = null, useExternalMemory = false,
+      hasGroup = false, missing = Float.NaN)
+
+    assert(booster != null)
  }

  test("training with external memory cache") {
@@ -235,4 +234,37 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
    assert(error(prevModel._booster) > error(nextModel._booster))
    assert(error(nextModel._booster) < 0.1)
  }
+
+  test("repartitionForTrainingGroup with group data") {
+    // test different splits to cover the corner cases.
+    for (split <- 1 to 20) {
+      val trainingRDD = sc.parallelize(Ranking.train, split)
+      val traingGroupsRDD = XGBoost.repartitionForTrainingGroup(trainingRDD, 4)
+      val trainingGroups: Array[Array[XGBLabeledPoint]] = traingGroupsRDD.collect()
+      // check the order of the groups with group id.
+      // Ranking.train has 20 groups
+      assert(trainingGroups.length == 20)
+
+      // compare all points
+      val allPoints = trainingGroups.sortBy(_(0).group).flatten
+      assert(allPoints.length == Ranking.train.size)
+      for (i <- 0 to Ranking.train.size - 1) {
+        assert(allPoints(i).group == Ranking.train(i).group)
+        assert(allPoints(i).label == Ranking.train(i).label)
+        assert(allPoints(i).values.sameElements(Ranking.train(i).values))
+      }
+    }
+  }
+
+  test("distributed training with group data") {
+    val trainingRDD = sc.parallelize(Ranking.train, 2)
+    val (booster, metrics) = XGBoost.trainDistributed(
+      trainingRDD,
+      List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+        "objective" -> "binary:logistic").toMap,
+      round = 5, nWorkers = numWorkers, eval = null, obj = null, useExternalMemory = false,
+      hasGroup = true, missing = Float.NaN)
+
+    assert(booster != null)
+  }
 }